; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-ONLY,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-ONLY,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512DQ-ONLY,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512DQ-ONLY,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-ONLY,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-ONLY,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512DQBW-ONLY,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512DQBW-ONLY,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.
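; Each @load_i16_stride7_vfN below loads a packed <7*N x i16> block from
; %in.vec, extracts the seven stride-7 subsequences with shufflevector, and
; stores them to %out.vec0 through %out.vec6, one per output pointer.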
define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i16_stride7_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld $16, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm2, (%rsi)
; SSE-NEXT: movd %xmm4, (%rdx)
; SSE-NEXT: movd %xmm6, (%rcx)
; SSE-NEXT: movd %xmm5, (%r8)
; SSE-NEXT: movd %xmm7, (%r9)
; SSE-NEXT: movd %xmm3, (%r10)
; SSE-NEXT: movd %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride7_vf2:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovd %xmm2, (%rsi)
; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx)
; AVX1-ONLY-NEXT: vmovd %xmm6, (%rcx)
; AVX1-ONLY-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX1-ONLY-NEXT: vmovd %xmm7, (%r9)
; AVX1-ONLY-NEXT: vmovd %xmm3, (%r10)
; AVX1-ONLY-NEXT: vmovd %xmm0, (%rax)
; AVX1-ONLY-NEXT: retq
;
; AVX2-SLOW-LABEL: load_i16_stride7_vf2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX2-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX2-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX2-SLOW-NEXT: vmovd %xmm4, (%rdx)
; AVX2-SLOW-NEXT: vmovd %xmm6, (%rcx)
; AVX2-SLOW-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX2-SLOW-NEXT: vmovd %xmm7, (%r9)
; AVX2-SLOW-NEXT: vmovd %xmm3, (%r10)
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: load_i16_stride7_vf2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8
; AVX2-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX2-FAST-NEXT: vmovd %xmm4, (%rdx)
; AVX2-FAST-NEXT: vmovd %xmm7, (%rcx)
; AVX2-FAST-NEXT: vmovd %xmm5, (%r8)
; AVX2-FAST-NEXT: vmovd %xmm8, (%r9)
; AVX2-FAST-NEXT: vmovd %xmm3, (%r10)
; AVX2-FAST-NEXT: vmovd %xmm0, (%rax)
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf2:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 8(%rdi), %xmm8
; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm1, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm2, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm4, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm7, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm5, (%r8)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm8, (%r9)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm3, (%r10)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm0, (%rax)
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-SLOW-LABEL: load_i16_stride7_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX512-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512-SLOW-NEXT: vmovd %xmm4, (%rdx)
; AVX512-SLOW-NEXT: vmovd %xmm6, (%rcx)
; AVX512-SLOW-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX512-SLOW-NEXT: vmovd %xmm7, (%r9)
; AVX512-SLOW-NEXT: vmovd %xmm3, (%r10)
; AVX512-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i16_stride7_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8
; AVX512F-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX512F-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512F-FAST-NEXT: vmovd %xmm4, (%rdx)
; AVX512F-FAST-NEXT: vmovd %xmm7, (%rcx)
; AVX512F-FAST-NEXT: vmovd %xmm5, (%r8)
; AVX512F-FAST-NEXT: vmovd %xmm8, (%r9)
; AVX512F-FAST-NEXT: vmovd %xmm3, (%r10)
; AVX512F-FAST-NEXT: vmovd %xmm0, (%rax)
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i16_stride7_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8
; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx)
; AVX512BW-FAST-NEXT: vmovd %xmm6, (%rcx)
; AVX512BW-FAST-NEXT: vmovd %xmm5, (%r8)
; AVX512BW-FAST-NEXT: vmovd %xmm7, (%r9)
; AVX512BW-FAST-NEXT: vmovd %xmm3, (%r10)
; AVX512BW-FAST-NEXT: vmovd %xmm8, (%rax)
; AVX512BW-FAST-NEXT: retq
  %wide.vec = load <14 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 0, i32 7>
  %strided.vec1 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 1, i32 8>
  %strided.vec2 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 2, i32 9>
  %strided.vec3 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 3, i32 10>
  %strided.vec4 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 4, i32 11>
  %strided.vec5 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 5, i32 12>
  %strided.vec6 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 6, i32 13>
  store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i16> %strided.vec5, ptr %out.vec5, align 64
  store <2 x i16> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i16_stride7_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm4
; SSE-NEXT: movdqa 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: pandn %xmm2, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm5, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm4, %xmm7
; SSE-NEXT: pand %xmm5, %xmm7
; SSE-NEXT: pandn %xmm1, %xmm5
; SSE-NEXT: por %xmm7, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7]
; SSE-NEXT: pand %xmm0, %xmm7
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE-NEXT: pandn %xmm5, %xmm0
; SSE-NEXT: por %xmm7, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; SSE-NEXT: movdqa %xmm3, %xmm10
; SSE-NEXT: movdqa %xmm3, %xmm9
; SSE-NEXT: psrlq $16, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1]
; SSE-NEXT: pslld $16, %xmm6
; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm1, %xmm10
; SSE-NEXT: psrld $16, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT: psrlq $48, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: movq %xmm2, (%rsi)
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: movq %xmm7, (%rcx)
; SSE-NEXT: movq %xmm8, (%r8)
; SSE-NEXT: movq %xmm6, (%r9)
; SSE-NEXT: movq %xmm10, (%rdi)
; SSE-NEXT: movq %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride7_vf4:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovq %xmm3, (%rsi)
; AVX1-ONLY-NEXT: vmovq %xmm5, (%rdx)
; AVX1-ONLY-NEXT: vmovq %xmm7, (%rcx)
; AVX1-ONLY-NEXT: vmovq %xmm8, (%r8)
; AVX1-ONLY-NEXT: vmovq %xmm9, (%r9)
; AVX1-ONLY-NEXT: vmovq %xmm4, (%r10)
; AVX1-ONLY-NEXT: vmovq %xmm0, (%rax)
; AVX1-ONLY-NEXT: retq
;
; AVX2-SLOW-LABEL: load_i16_stride7_vf4:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX2-SLOW-NEXT: vmovq %xmm1, (%rsi)
; AVX2-SLOW-NEXT: vmovq %xmm6, (%rdx)
; AVX2-SLOW-NEXT: vmovq %xmm3, (%rcx)
; AVX2-SLOW-NEXT: vmovq %xmm4, (%r8)
; AVX2-SLOW-NEXT: vmovq %xmm5, (%r9)
; AVX2-SLOW-NEXT: vmovq %xmm7, (%r10)
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: load_i16_stride7_vf4:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm8
; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-NEXT: vmovq %xmm5, (%rsi)
; AVX2-FAST-NEXT: vmovq %xmm6, (%rdx)
; AVX2-FAST-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FAST-NEXT: vmovq %xmm3, (%r8)
; AVX2-FAST-NEXT: vmovq %xmm4, (%r9)
; AVX2-FAST-NEXT: vmovq %xmm7, (%r10)
; AVX2-FAST-NEXT: vmovq %xmm0, (%rax)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf4:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm8
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%r8)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%r9)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm7, (%r10)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%rax)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i16_stride7_vf4:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2
; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512F-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx)
; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rcx)
; AVX512F-SLOW-NEXT: vmovq %xmm5, (%r8)
; AVX512F-SLOW-NEXT: vmovq %xmm6, (%r9)
; AVX512F-SLOW-NEXT: vmovq %xmm7, (%r10)
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rax)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i16_stride7_vf4:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm8
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512F-FAST-NEXT: vmovq %xmm1, (%rcx)
; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512F-FAST-NEXT: vmovq %xmm6, (%r9)
; AVX512F-FAST-NEXT: vmovq %xmm7, (%r10)
; AVX512F-FAST-NEXT: vmovq %xmm2, (%rax)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride7_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-NEXT: vmovq %xmm5, (%r8)
; AVX512BW-NEXT: vmovq %xmm6, (%r9)
; AVX512BW-NEXT: vmovq %xmm7, (%r10)
; AVX512BW-NEXT: vmovq %xmm8, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %wide.vec = load <28 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
  %strided.vec1 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
  %strided.vec2 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
  %strided.vec3 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
  %strided.vec4 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
  %strided.vec5 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
  %strided.vec6 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
  store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <4 x i16> %strided.vec5, ptr %out.vec5, align 64
  store <4 x i16> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i16_stride7_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm9
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm8
; SSE-NEXT: movdqa 80(%rdi), %xmm7
; SSE-NEXT: movdqa 64(%rdi), %xmm1
; SSE-NEXT: movdqa 96(%rdi), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE-NEXT: movdqa %xmm11, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT: pand %xmm11, %xmm5
; SSE-NEXT: por %xmm4, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
; SSE-NEXT: movdqa %xmm3, %xmm10
; SSE-NEXT: pandn %xmm5, %xmm10
; SSE-NEXT: movaps %xmm2, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm8[2,2]
; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: movaps %xmm4, %xmm12
; SSE-NEXT: andnps %xmm5, %xmm12
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE-NEXT: pand %xmm4, %xmm5
; SSE-NEXT: por %xmm12, %xmm5
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: por %xmm10, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,1]
; SSE-NEXT: movdqa %xmm6, %xmm10
; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5]
; SSE-NEXT: movdqa %xmm11, %xmm12
; SSE-NEXT: pandn %xmm10, %xmm12
; SSE-NEXT: movdqa %xmm7, %xmm10
; SSE-NEXT: psrld $16, %xmm10
; SSE-NEXT: movdqa %xmm1, %xmm15
; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
; SSE-NEXT: pand %xmm11, %xmm15
; SSE-NEXT: por %xmm12, %xmm15
; SSE-NEXT: movdqa %xmm3, %xmm13
; SSE-NEXT: pandn %xmm15, %xmm13
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm10, %xmm12
; SSE-NEXT: pandn %xmm0, %xmm12
; SSE-NEXT: movdqa %xmm9, %xmm15
; SSE-NEXT: pand %xmm10, %xmm15
; SSE-NEXT: por %xmm12, %xmm15
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,3,3,4,5,6,7]
; SSE-NEXT: pand %xmm4, %xmm12
; SSE-NEXT: movaps %xmm2, %xmm15
; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7]
; SSE-NEXT: pandn %xmm15, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm15
; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
; SSE-NEXT: por %xmm12, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm12
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: por %xmm13, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm13
; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,1]
; SSE-NEXT: pand %xmm11, %xmm15
; SSE-NEXT: pandn %xmm14, %xmm11
; SSE-NEXT: por %xmm15, %xmm11
; SSE-NEXT: movdqa %xmm3, %xmm14
; SSE-NEXT: pandn %xmm11, %xmm14
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm15[1]
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
; SSE-NEXT: movss {{.*#+}} xmm11 = xmm15[0],xmm11[1,2,3]
; SSE-NEXT: andps %xmm3, %xmm11
; SSE-NEXT: orps %xmm14, %xmm11
; SSE-NEXT: movdqa %xmm10, %xmm14
; SSE-NEXT: pandn %xmm2, %xmm14
; SSE-NEXT: movdqa %xmm8, %xmm15
; SSE-NEXT: pand %xmm10, %xmm15
; SSE-NEXT: por %xmm14, %xmm15
; SSE-NEXT: movdqa %xmm0, %xmm14
; SSE-NEXT: psrld $16, %xmm14
; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2]
; SSE-NEXT: movss {{.*#+}} xmm15 = xmm12[0],xmm15[1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE-NEXT: andps %xmm3, %xmm15
; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,4,7]
; SSE-NEXT: pandn %xmm13, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm13
; SSE-NEXT: psrlq $16, %xmm13
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
; SSE-NEXT: por %xmm15, %xmm3
; SSE-NEXT: movdqa %xmm7, %xmm15
; SSE-NEXT: pand %xmm10, %xmm15
; SSE-NEXT: pandn %xmm1, %xmm10
; SSE-NEXT: por %xmm15, %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
; SSE-NEXT: psrlq $48, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,1,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,4,7]
; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm10[0,2]
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,4,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movdqa %xmm5, (%rsi)
; SSE-NEXT: movdqa %xmm4, (%rdx)
; SSE-NEXT: movaps %xmm11, (%rcx)
; SSE-NEXT: movdqa %xmm3, (%r8)
; SSE-NEXT: movapd %xmm13, (%r9)
; SSE-NEXT: movaps %xmm14, (%rdi)
; SSE-NEXT: movapd %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride7_vf8:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5
; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,2,3,3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm5[2],xmm7[2],zero
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6],xmm9[7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,8,9,6,7,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4],xmm8[5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,0,1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,2,3,3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm11[1],xmm10[1]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm5[1],xmm7[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,0,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,7,7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5,6,7]
; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
958 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
959 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
960 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
961 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
962 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
963 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
964 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
965 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
966 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsi)
967 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rdx)
968 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx)
969 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%r8)
970 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%r9)
971 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%r10)
972 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax)
973 ; AVX1-ONLY-NEXT: retq
975 ; AVX2-SLOW-LABEL: load_i16_stride7_vf8:
976 ; AVX2-SLOW: # %bb.0:
977 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
978 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
979 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
980 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
981 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0
982 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
983 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
984 ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
985 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
986 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
987 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
988 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
989 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
990 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
991 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
992 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7
993 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8
994 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
995 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
996 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
997 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
998 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
999 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1000 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1001 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1002 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
1003 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
1004 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
1005 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
1006 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1007 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1008 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1009 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1010 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1011 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1012 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1013 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1014 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1015 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
1016 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7]
1017 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,2,1,0,4,5,6,7]
1018 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
1019 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1020 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1021 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1022 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1023 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
1024 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
1025 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
1026 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
1027 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1028 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1029 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1030 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
1031 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
1032 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
1033 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
1034 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
1035 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1036 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
1037 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1038 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
1039 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
1040 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1041 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1042 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
1043 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3
1044 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
1045 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
1046 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1047 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1048 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1049 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1050 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1051 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsi)
1052 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rdx)
1053 ; AVX2-SLOW-NEXT: vmovdqa %xmm9, (%rcx)
1054 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, (%r8)
1055 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, (%r9)
1056 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, (%r10)
1057 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rax)
1058 ; AVX2-SLOW-NEXT: vzeroupper
1059 ; AVX2-SLOW-NEXT: retq
1061 ; AVX2-FAST-LABEL: load_i16_stride7_vf8:
1062 ; AVX2-FAST: # %bb.0:
1063 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
1064 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
1065 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3
1066 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
1067 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0
1068 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
1069 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
1070 ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm2
1071 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1072 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1073 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
1074 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
1075 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1076 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1077 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1078 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7
1079 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8
1080 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
1081 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
1082 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
1083 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1084 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
1085 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1086 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1087 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
1088 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
1089 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
1090 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1091 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
1092 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1093 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1094 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1095 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1096 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1097 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1098 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1099 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
1100 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7]
1101 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u]
1102 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1103 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1104 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1105 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1106 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
1107 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14
1108 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12
1109 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
1110 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1111 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1112 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1113 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
1114 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13
1115 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1116 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1117 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
1118 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1119 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1120 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1121 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1122 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
1123 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1124 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
1125 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1126 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1127 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1128 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1129 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1130 ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsi)
1131 ; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rdx)
1132 ; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rcx)
1133 ; AVX2-FAST-NEXT: vmovdqa %xmm10, (%r8)
1134 ; AVX2-FAST-NEXT: vmovdqa %xmm11, (%r9)
1135 ; AVX2-FAST-NEXT: vmovdqa %xmm7, (%r10)
1136 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rax)
1137 ; AVX2-FAST-NEXT: vzeroupper
1138 ; AVX2-FAST-NEXT: retq
1140 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf8:
1141 ; AVX2-FAST-PERLANE: # %bb.0:
1142 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1143 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10
1144 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3
1145 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4
1146 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0
1147 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1
1148 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
1149 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm2
1150 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1151 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1152 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
1153 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
1154 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1155 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1156 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1157 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7
1158 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8
1159 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
1160 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
1161 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
1162 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1163 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11
1164 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1165 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1166 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
1167 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
1168 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
1169 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1170 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11
1171 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1172 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1173 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1174 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1175 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1176 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1177 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1178 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
1179 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7]
1180 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u]
1181 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1182 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1183 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1184 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1185 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
1186 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14
1187 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12
1188 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
1189 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1190 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1191 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1192 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm14
1193 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13
1194 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1195 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1196 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
1197 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1198 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1199 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1200 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1201 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
1202 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1203 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
1204 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1205 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1206 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1207 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1208 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1209 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsi)
1210 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rdx)
1211 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, (%rcx)
1212 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, (%r8)
1213 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%r9)
1214 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, (%r10)
1215 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rax)
1216 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1217 ; AVX2-FAST-PERLANE-NEXT: retq
1219 ; AVX512F-SLOW-LABEL: load_i16_stride7_vf8:
1220 ; AVX512F-SLOW: # %bb.0:
1221 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1222 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1223 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0
1224 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
1225 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1226 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
1227 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1228 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1229 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4
1230 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
1231 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1232 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
1233 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1234 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1235 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1236 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1237 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1238 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
1239 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
1240 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1241 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1242 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1243 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1244 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
1245 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1246 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
1247 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
1248 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1249 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1250 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1251 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1252 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
1253 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1254 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1255 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1256 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1257 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1258 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
1259 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,2,1,0,4,5,6,7]
1260 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
1261 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1262 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1263 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1264 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1265 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1266 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
1267 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1268 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
1269 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1270 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1271 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1272 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1273 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
1274 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
1275 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1276 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
1277 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1278 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1279 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1280 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
1281 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6]
1282 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1283 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1284 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
1285 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
1286 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
1287 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
1288 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1289 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1290 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1291 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1292 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1293 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, (%rsi)
1294 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rdx)
1295 ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%rcx)
1296 ; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%r8)
1297 ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%r9)
1298 ; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%r10)
1299 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rax)
1300 ; AVX512F-SLOW-NEXT: vzeroupper
1301 ; AVX512F-SLOW-NEXT: retq
1303 ; AVX512F-FAST-LABEL: load_i16_stride7_vf8:
1304 ; AVX512F-FAST: # %bb.0:
1305 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
1306 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
1307 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0
1308 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
1309 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1310 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm2
1311 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1312 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1313 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4
1314 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
1315 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1316 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
1317 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1318 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1319 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1320 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1321 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1322 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
1323 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1324 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
1325 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1326 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1327 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
1328 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1329 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
1330 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1331 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
1332 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1333 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1334 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
1335 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1336 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1337 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1338 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1339 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
1340 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
1341 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u]
1342 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1343 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1344 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1345 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1346 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
1347 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12
1348 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
1349 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
1350 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
1351 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1352 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1353 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12
1354 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11
1355 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1356 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1357 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1358 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1359 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1360 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1361 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1362 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
1363 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1364 ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
1365 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
1366 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1367 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1368 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1369 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1370 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%rsi)
1371 ; AVX512F-FAST-NEXT: vmovdqa %xmm6, (%rdx)
1372 ; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%rcx)
1373 ; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%r8)
1374 ; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%r9)
1375 ; AVX512F-FAST-NEXT: vmovdqa %xmm10, (%r10)
1376 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax)
1377 ; AVX512F-FAST-NEXT: vzeroupper
1378 ; AVX512F-FAST-NEXT: retq
1380 ; AVX512BW-LABEL: load_i16_stride7_vf8:
1381 ; AVX512BW: # %bb.0:
1382 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1383 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1384 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1385 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1386 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49]
1387 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1388 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50]
1389 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1390 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51]
1391 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1392 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52]
1393 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
1394 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53]
1395 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
1396 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54]
1397 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
1398 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55]
1399 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
1400 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
1401 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
1402 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
1403 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
1404 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
1405 ; AVX512BW-NEXT: vmovdqa %xmm7, (%r10)
1406 ; AVX512BW-NEXT: vmovdqa %xmm8, (%rax)
1407 ; AVX512BW-NEXT: vzeroupper
1408 ; AVX512BW-NEXT: retq
1409 %wide.vec = load <56 x i16>, ptr %in.vec, align 64
1410 %strided.vec0 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
1411 %strided.vec1 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
1412 %strided.vec2 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
1413 %strided.vec3 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
1414 %strided.vec4 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
1415 %strided.vec5 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
1416 %strided.vec6 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
1417 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
1418 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
1419 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
1420 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
1421 store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
1422 store <8 x i16> %strided.vec5, ptr %out.vec5, align 64
1423 store <8 x i16> %strided.vec6, ptr %out.vec6, align 64
1424 ret void
1425 }
1427 define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
1428 ; SSE-LABEL: load_i16_stride7_vf16:
1429 ; SSE: # %bb.0:
1430 ; SSE-NEXT: subq $232, %rsp
1431 ; SSE-NEXT: movdqa 80(%rdi), %xmm11
1432 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1433 ; SSE-NEXT: movdqa 64(%rdi), %xmm9
1434 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1435 ; SSE-NEXT: movdqa 112(%rdi), %xmm12
1436 ; SSE-NEXT: movdqa 128(%rdi), %xmm6
1437 ; SSE-NEXT: movaps 160(%rdi), %xmm5
1438 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
1439 ; SSE-NEXT: movaps 144(%rdi), %xmm7
1440 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1441 ; SSE-NEXT: movdqa 192(%rdi), %xmm13
1442 ; SSE-NEXT: movdqa 176(%rdi), %xmm15
1443 ; SSE-NEXT: movdqa 208(%rdi), %xmm14
1444 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
1445 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1446 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
1447 ; SSE-NEXT: movdqa %xmm1, %xmm2
1448 ; SSE-NEXT: pandn %xmm0, %xmm2
1449 ; SSE-NEXT: movdqa %xmm15, %xmm0
1450 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
1451 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1452 ; SSE-NEXT: pand %xmm1, %xmm0
1453 ; SSE-NEXT: por %xmm2, %xmm0
1454 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
1455 ; SSE-NEXT: movdqa %xmm3, %xmm2
1456 ; SSE-NEXT: movdqa %xmm3, %xmm10
1457 ; SSE-NEXT: pandn %xmm0, %xmm2
1458 ; SSE-NEXT: movaps %xmm7, %xmm0
1459 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2]
1460 ; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
1461 ; SSE-NEXT: movaps %xmm8, %xmm4
1462 ; SSE-NEXT: andnps %xmm0, %xmm4
1463 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
1464 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1465 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3]
1466 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1467 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7]
1468 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1469 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
1470 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1471 ; SSE-NEXT: pand %xmm8, %xmm3
1472 ; SSE-NEXT: por %xmm4, %xmm3
1473 ; SSE-NEXT: pand %xmm10, %xmm3
1474 ; SSE-NEXT: por %xmm2, %xmm3
1475 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1476 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1477 ; SSE-NEXT: movdqa %xmm1, %xmm2
1478 ; SSE-NEXT: pandn %xmm0, %xmm2
1479 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
1480 ; SSE-NEXT: pand %xmm1, %xmm9
1481 ; SSE-NEXT: por %xmm2, %xmm9
1482 ; SSE-NEXT: movdqa %xmm10, %xmm2
1483 ; SSE-NEXT: pandn %xmm9, %xmm2
1484 ; SSE-NEXT: movaps 32(%rdi), %xmm0
1485 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1486 ; SSE-NEXT: movaps 48(%rdi), %xmm3
1487 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1488 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
1489 ; SSE-NEXT: movaps %xmm8, %xmm4
1490 ; SSE-NEXT: andnps %xmm0, %xmm4
1491 ; SSE-NEXT: movdqa (%rdi), %xmm0
1492 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1493 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1494 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7]
1495 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
1496 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1497 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
1498 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1499 ; SSE-NEXT: pand %xmm8, %xmm3
1500 ; SSE-NEXT: por %xmm4, %xmm3
1501 ; SSE-NEXT: pand %xmm10, %xmm3
1502 ; SSE-NEXT: por %xmm2, %xmm3
1503 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1504 ; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5]
1505 ; SSE-NEXT: movdqa %xmm1, %xmm2
1506 ; SSE-NEXT: pandn %xmm14, %xmm2
1507 ; SSE-NEXT: psrld $16, %xmm13
1508 ; SSE-NEXT: movdqa %xmm15, %xmm4
1509 ; SSE-NEXT: movdqa %xmm15, %xmm11
1510 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1511 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1512 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
1513 ; SSE-NEXT: pand %xmm1, %xmm4
1514 ; SSE-NEXT: por %xmm2, %xmm4
1515 ; SSE-NEXT: movdqa %xmm10, %xmm0
1516 ; SSE-NEXT: movdqa %xmm10, %xmm14
1517 ; SSE-NEXT: pandn %xmm4, %xmm0
1518 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
1519 ; SSE-NEXT: movdqa %xmm7, %xmm4
1520 ; SSE-NEXT: pandn %xmm12, %xmm4
1521 ; SSE-NEXT: pand %xmm7, %xmm6
1522 ; SSE-NEXT: por %xmm4, %xmm6
1523 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1524 ; SSE-NEXT: movdqa %xmm9, %xmm4
1525 ; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload
1526 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
1527 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
1528 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
1529 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
1530 ; SSE-NEXT: movdqa %xmm8, %xmm2
1531 ; SSE-NEXT: pandn %xmm4, %xmm2
1532 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
1533 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
1534 ; SSE-NEXT: pand %xmm8, %xmm4
1535 ; SSE-NEXT: por %xmm4, %xmm2
1536 ; SSE-NEXT: pand %xmm10, %xmm2
1537 ; SSE-NEXT: por %xmm0, %xmm2
1538 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1539 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1540 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1541 ; SSE-NEXT: movdqa %xmm1, %xmm4
1542 ; SSE-NEXT: pandn %xmm0, %xmm4
1543 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1544 ; SSE-NEXT: movdqa %xmm3, %xmm0
1545 ; SSE-NEXT: psrld $16, %xmm0
1546 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1547 ; SSE-NEXT: movdqa %xmm15, %xmm5
1548 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1549 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
1550 ; SSE-NEXT: pand %xmm1, %xmm5
1551 ; SSE-NEXT: por %xmm4, %xmm5
1552 ; SSE-NEXT: movdqa %xmm10, %xmm0
1553 ; SSE-NEXT: pandn %xmm5, %xmm0
1554 ; SSE-NEXT: movdqa %xmm7, %xmm4
1555 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1556 ; SSE-NEXT: pandn %xmm10, %xmm4
1557 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1558 ; SSE-NEXT: movdqa %xmm12, %xmm5
1559 ; SSE-NEXT: pand %xmm7, %xmm5
1560 ; SSE-NEXT: por %xmm4, %xmm5
1561 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
1562 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
1563 ; SSE-NEXT: pand %xmm8, %xmm4
1564 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1565 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1566 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
1567 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,1]
1568 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
1569 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
1570 ; SSE-NEXT: pandn %xmm5, %xmm8
1571 ; SSE-NEXT: por %xmm4, %xmm8
1572 ; SSE-NEXT: pand %xmm14, %xmm8
1573 ; SSE-NEXT: por %xmm0, %xmm8
1574 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1575 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1576 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1]
1577 ; SSE-NEXT: movdqa %xmm1, %xmm4
1578 ; SSE-NEXT: pandn %xmm0, %xmm4
1579 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1580 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
1581 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1]
1582 ; SSE-NEXT: pand %xmm1, %xmm0
1583 ; SSE-NEXT: por %xmm4, %xmm0
1584 ; SSE-NEXT: movdqa %xmm14, %xmm4
1585 ; SSE-NEXT: pandn %xmm0, %xmm4
1586 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3]
1587 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,4,7]
1588 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
1589 ; SSE-NEXT: movdqa %xmm13, %xmm2
1590 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm0[1]
1591 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1592 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1593 ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
1594 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3]
1595 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1596 ; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3]
1597 ; SSE-NEXT: andps %xmm14, %xmm11
1598 ; SSE-NEXT: orps %xmm4, %xmm11
1599 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1600 ; SSE-NEXT: movdqa %xmm15, %xmm0
1601 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1602 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1603 ; SSE-NEXT: pand %xmm1, %xmm0
1604 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1605 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1]
1606 ; SSE-NEXT: pandn %xmm4, %xmm1
1607 ; SSE-NEXT: por %xmm0, %xmm1
1608 ; SSE-NEXT: movdqa %xmm14, %xmm0
1609 ; SSE-NEXT: movaps %xmm14, %xmm15
1610 ; SSE-NEXT: pandn %xmm1, %xmm0
1611 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1612 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3]
1613 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7]
1614 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1615 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3]
1616 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1]
1617 ; SSE-NEXT: movdqa %xmm10, %xmm1
1618 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
1619 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
1620 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
1621 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3]
1622 ; SSE-NEXT: andps %xmm15, %xmm4
1623 ; SSE-NEXT: orps %xmm0, %xmm4
1624 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1625 ; SSE-NEXT: movdqa %xmm7, %xmm0
1626 ; SSE-NEXT: pandn %xmm9, %xmm0
1627 ; SSE-NEXT: pand %xmm7, %xmm2
1628 ; SSE-NEXT: por %xmm0, %xmm2
1629 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7]
1630 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1631 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
1632 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
1633 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
1634 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1635 ; SSE-NEXT: movdqa %xmm4, %xmm5
1636 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
1637 ; SSE-NEXT: movdqa %xmm6, %xmm12
1638 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
1639 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
1640 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1641 ; SSE-NEXT: movdqa %xmm8, %xmm6
1642 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
1643 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
1644 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
1645 ; SSE-NEXT: movaps %xmm15, %xmm2
1646 ; SSE-NEXT: andnps %xmm5, %xmm2
1647 ; SSE-NEXT: andps %xmm15, %xmm0
1648 ; SSE-NEXT: orps %xmm0, %xmm2
1649 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1650 ; SSE-NEXT: movdqa %xmm7, %xmm0
1651 ; SSE-NEXT: pandn %xmm13, %xmm0
1652 ; SSE-NEXT: movdqa %xmm13, %xmm9
1653 ; SSE-NEXT: movdqa %xmm14, %xmm5
1654 ; SSE-NEXT: movdqa %xmm14, %xmm13
1655 ; SSE-NEXT: pand %xmm7, %xmm5
1656 ; SSE-NEXT: por %xmm0, %xmm5
1657 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7]
1658 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1659 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1660 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
1661 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1662 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1663 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1664 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
1665 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1666 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
1667 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1668 ; SSE-NEXT: andps %xmm15, %xmm0
1669 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1670 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1671 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
1672 ; SSE-NEXT: andnps %xmm1, %xmm15
1673 ; SSE-NEXT: orps %xmm0, %xmm15
1674 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1675 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1676 ; SSE-NEXT: movdqa %xmm0, %xmm14
1677 ; SSE-NEXT: psrld $16, %xmm14
1678 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1679 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
1680 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1681 ; SSE-NEXT: movdqa %xmm0, %xmm5
1682 ; SSE-NEXT: movdqa %xmm2, %xmm0
1683 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
1684 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1685 ; SSE-NEXT: psrlq $48, %xmm0
1686 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1687 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1688 ; SSE-NEXT: movdqa %xmm8, %xmm0
1689 ; SSE-NEXT: psrlq $16, %xmm0
1690 ; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload
1691 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1692 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1693 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1694 ; SSE-NEXT: movdqa %xmm4, %xmm5
1695 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3]
1696 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
1697 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
1698 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1699 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
1700 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
1701 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1702 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
1703 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1704 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1705 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1706 ; SSE-NEXT: movdqa %xmm10, %xmm15
1707 ; SSE-NEXT: psrld $16, %xmm15
1708 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1709 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
1710 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
1711 ; SSE-NEXT: movdqa %xmm2, %xmm1
1712 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
1713 ; SSE-NEXT: psrlq $48, %xmm1
1714 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1715 ; SSE-NEXT: movdqa %xmm9, %xmm4
1716 ; SSE-NEXT: movdqa %xmm9, %xmm1
1717 ; SSE-NEXT: psrlq $16, %xmm1
1718 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1719 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
1720 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1721 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1722 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3]
1723 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1724 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1725 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1726 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1727 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
1728 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1729 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1730 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
1731 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1732 ; SSE-NEXT: movdqa %xmm8, %xmm0
1733 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1734 ; SSE-NEXT: movdqa %xmm11, %xmm8
1735 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1736 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1737 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
1738 ; SSE-NEXT: movdqa %xmm7, %xmm0
1739 ; SSE-NEXT: pandn %xmm5, %xmm0
1740 ; SSE-NEXT: movdqa %xmm12, %xmm11
1741 ; SSE-NEXT: pand %xmm7, %xmm11
1742 ; SSE-NEXT: por %xmm0, %xmm11
1743 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3]
1744 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
1745 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1746 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1747 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2]
1748 ; SSE-NEXT: pand %xmm7, %xmm2
1749 ; SSE-NEXT: movdqa %xmm9, %xmm11
1750 ; SSE-NEXT: pandn %xmm9, %xmm7
1751 ; SSE-NEXT: por %xmm2, %xmm7
1752 ; SSE-NEXT: movdqa %xmm4, %xmm0
1753 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
1754 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1755 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1756 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
1757 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3]
1758 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
1759 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1760 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1761 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2]
1762 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
1763 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1764 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1765 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1766 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
1767 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1768 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
1769 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
1770 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1771 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7]
1772 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
1773 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
1774 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
1775 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
1776 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
1777 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1778 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1]
1779 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1780 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
1781 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
1782 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,1,2,3]
1783 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
1784 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
1785 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1786 ; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7]
1787 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1788 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
1789 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
1790 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1791 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1792 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1]
1793 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1794 ; SSE-NEXT: movaps %xmm2, (%rsi)
1795 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1796 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
1797 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1798 ; SSE-NEXT: movaps %xmm2, (%rdx)
1799 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1800 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
1801 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1802 ; SSE-NEXT: movaps %xmm2, (%rcx)
1803 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1804 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
1805 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1806 ; SSE-NEXT: movaps %xmm2, (%r8)
1807 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1808 ; SSE-NEXT: movaps %xmm2, 16(%r8)
1809 ; SSE-NEXT: movapd %xmm1, (%r9)
1810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1811 ; SSE-NEXT: movaps %xmm1, 16(%r9)
1812 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1813 ; SSE-NEXT: movaps %xmm15, (%rax)
1814 ; SSE-NEXT: movaps %xmm14, 16(%rax)
1815 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1816 ; SSE-NEXT: movapd %xmm11, (%rax)
1817 ; SSE-NEXT: movapd %xmm0, 16(%rax)
1818 ; SSE-NEXT: addq $232, %rsp
1819 ; SSE-NEXT: retq
1821 ; AVX1-ONLY-LABEL: load_i16_stride7_vf16:
1822 ; AVX1-ONLY: # %bb.0:
1823 ; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108
1824 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0
1825 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1826 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0
1827 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm12
1828 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
1829 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1830 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
1831 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1832 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1
1833 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2
1834 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1835 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8
1836 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1837 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2
1838 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1839 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1
1840 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1841 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1842 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1843 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1844 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1845 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5
1846 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
1847 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1848 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1849 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1850 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
1851 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0]
1852 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7
1853 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,6],xmm10[7]
1854 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6
1855 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,3,2,3]
1856 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
1857 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11
1858 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
1859 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1860 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9
1861 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,3,3]
1862 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10
1863 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,1,0,3]
1864 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7]
1865 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1866 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1
1867 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2
1868 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm2[2],zero
1869 ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm13
1870 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1871 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1872 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7]
1873 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
1874 ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm15, %ymm11
1875 ; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9
1876 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9
1877 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1878 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7]
1879 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1880 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1881 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
1882 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4
1883 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1884 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1885 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload
1886 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5],xmm8[6],mem[7]
1887 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
1888 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,3,2,4,5,6,7]
1889 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
1890 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1891 ; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm9
1892 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1893 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1894 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
1895 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9
1896 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1897 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1898 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1899 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5]
1900 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1901 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm11[7]
1902 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm11
1903 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5
1904 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1905 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9
1906 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
1907 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
1908 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1909 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7]
1910 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm13
1911 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1912 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3]
1913 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,3,4,5,6,7]
1914 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4],xmm15[5,6,7]
1915 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
1916 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm9
1917 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm15
1918 ; AVX1-ONLY-NEXT: vorps %ymm9, %ymm15, %ymm9
1919 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1920 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
1921 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0
1922 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm9
1923 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0
1924 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1925 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm0
1926 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1927 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15
1928 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1929 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill
1930 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,3,2,3]
1931 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7]
1932 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1933 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7]
1934 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
1935 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1936 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11
1937 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
1938 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4
1939 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
1940 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1941 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2]
1942 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm2
1943 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,5],xmm9[6,7]
1944 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,1]
1945 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7]
1946 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,1,1]
1947 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
1948 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1949 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,3]
1950 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
1951 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1952 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3]
1953 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm12[1]
1954 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1955 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,1,2,3]
1956 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
1957 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3,4,5,6,7]
1958 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
1959 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9
1960 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
1961 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1
1962 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1963 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
1964 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
1965 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
1966 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
1967 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1968 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
1969 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1970 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7]
1971 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1972 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm5[6],xmm15[7]
1973 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1974 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
1975 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7]
1976 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1977 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7]
1978 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
1979 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7]
1980 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,6,7]
1981 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2]
1982 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13
1983 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm7[1],xmm14[2,3,4,5,6,7]
1984 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,0,4,5,6,7]
1985 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
1986 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3,4,5,6,7]
1987 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1988 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11
1989 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
1990 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,7]
1991 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1992 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
1993 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3]
1994 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1995 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm12
1996 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
1997 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
1998 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9
1999 ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10
2000 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9
2001 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2002 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2003 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1
2004 ; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9
2005 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1
2006 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2007 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
2008 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
2009 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7]
2010 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,3,2,3]
2011 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7]
2012 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2013 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
2014 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7]
2015 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2016 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
2017 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
2018 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2019 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm1
2020 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2021 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3]
2022 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
2023 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm9
2024 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
2025 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7]
2026 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3]
2027 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
2028 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2029 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
2030 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
2031 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7]
2032 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
2033 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
2034 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7]
2035 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2036 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0
2037 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1
2038 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
2039 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2040 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
2041 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
2042 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm1
2043 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15
2044 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2045 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2046 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
2047 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
2048 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2049 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2050 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7]
2051 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2052 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
2053 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2054 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1
2055 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2056 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2057 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
2058 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2]
2059 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7]
2060 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7]
2061 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
2062 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,7,7]
2063 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
2064 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7]
2065 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,2]
2066 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2067 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
2068 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7]
2069 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2070 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2071 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0
2072 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1
2073 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
2074 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2075 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2076 ; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1],mem[0],zero
2077 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
2078 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7]
2079 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7]
2080 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2081 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2082 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2083 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1]
2084 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2085 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2086 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2087 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2088 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
2089 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
2090 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
2091 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
2092 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3]
2093 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,0,3]
2094 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2095 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
2096 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3]
2097 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2098 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7]
2099 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2100 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1
2101 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2
2102 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
2103 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2104 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi)
2105 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2106 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx)
2107 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2108 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx)
2109 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2110 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8)
2111 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2112 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9)
2113 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
2114 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
2115 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
2116 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax)
2117 ; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108
2118 ; AVX1-ONLY-NEXT: vzeroupper
2119 ; AVX1-ONLY-NEXT: retq
2121 ; AVX2-SLOW-LABEL: load_i16_stride7_vf16:
2122 ; AVX2-SLOW: # %bb.0:
2123 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2124 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2125 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
2126 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3
2127 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9
2128 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5
2129 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6
2130 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2131 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,4,7]
2132 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
2133 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
2134 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
2135 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
2136 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2137 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[0,1,0,2]
2138 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,1,4,5,6,5]
2139 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2140 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
2141 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2142 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
2143 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5],xmm10[6],xmm8[7]
2144 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2145 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2146 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3]
2147 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2148 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u]
2149 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
2150 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
2151 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2152 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2153 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
2154 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7]
2155 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2156 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2157 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
2158 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7,8,9,10],ymm12[11],ymm10[12,13,14,15]
2159 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
2160 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, %xmm10
2161 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm12, %ymm8
2162 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2163 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2164 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
2165 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2166 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2167 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2168 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
2169 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7]
2170 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2171 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2172 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2173 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11
2174 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7]
2175 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2176 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1]
2177 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2178 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4],ymm12[5,6,7,8,9,10,11],ymm11[12],ymm12[13,14,15]
2179 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
2180 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm11, %ymm8
2181 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2182 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2183 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
2184 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2185 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2186 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,2]
2187 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,0,3,4,5,4,7]
2188 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2189 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
2190 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15]
2191 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
2192 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2193 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2194 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
2195 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7]
2196 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7]
2197 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2198 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
2199 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4]
2200 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
2201 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
2202 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2203 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13
2204 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7]
2205 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,2,1,0,4,5,6,7]
2206 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7]
2207 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2208 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,3,2,3]
2209 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2210 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10
2211 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
2212 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2213 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2214 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2215 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
2216 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2217 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2218 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5]
2219 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2220 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
2221 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2222 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2223 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
2224 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2225 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14
2226 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
2227 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
2228 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
2229 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2230 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1]
2231 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15]
2232 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2233 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2234 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2235 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2236 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
2237 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2238 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2239 ; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm12
2240 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13
2241 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
2242 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
2243 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
2244 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2245 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15]
2246 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2247 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2248 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
2249 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2250 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4
2251 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
2252 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2253 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
2254 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2255 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
2256 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
2257 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15]
2258 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3]
2259 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7]
2260 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
2261 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
2262 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
2263 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
2264 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
2265 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2266 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2267 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2268 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2269 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
2270 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2271 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
2272 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
2273 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2274 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
2275 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
2276 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2277 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
2278 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2279 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
2280 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
2281 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2282 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2283 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2284 ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi)
2285 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rdx)
2286 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
2287 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8)
2288 ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%r9)
2289 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2290 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rax)
2291 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2292 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax)
2293 ; AVX2-SLOW-NEXT: vzeroupper
2294 ; AVX2-SLOW-NEXT: retq
2296 ; AVX2-FAST-LABEL: load_i16_stride7_vf16:
2297 ; AVX2-FAST: # %bb.0:
2298 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
2299 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2300 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2
2301 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3
2302 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
2303 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6
2304 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7
2305 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2]
2306 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
2307 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
2308 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5]
2309 ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1]
2310 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
2311 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
2312 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
2313 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2314 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
2315 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7]
2316 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2317 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2318 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
2319 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2320 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u]
2321 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
2322 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
2323 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2324 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2325 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
2326 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
2327 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2328 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2329 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,5,1,u,4,u,u,u]
2330 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm13, %ymm11
2331 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
2332 ; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm11
2333 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm13, %ymm10
2334 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
2335 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
2336 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
2337 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2338 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2339 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2340 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
2341 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15]
2342 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm8[4,5,6,7]
2343 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2344 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2345 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12
2346 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7]
2347 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2348 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2349 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,6,1,u,5,u,u,u]
2350 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12
2351 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
2352 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10
2353 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
2354 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
2355 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7]
2356 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2357 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2358 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5]
2359 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm13
2360 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2361 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
2362 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15]
2363 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
2364 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
2365 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
2366 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
2367 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
2368 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2369 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,1,1,3]
2370 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
2371 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
2372 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2373 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2374 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7]
2375 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
2376 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2377 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,3,2,3]
2378 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2379 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11
2380 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15]
2381 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
2382 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2383 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
2384 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15
2385 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14
2386 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
2387 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
2388 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2389 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,2,6,u,u,u]
2390 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm9
2391 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
2392 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm5
2393 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm15[1],xmm5[2],xmm15[3],xmm5[4,5,6,7]
2394 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
2395 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2396 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2397 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7]
2398 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
2399 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9
2400 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm9[2,3]
2401 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15]
2402 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm5[4,5,6,7]
2403 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
2404 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4]
2405 ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1]
2406 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5
2407 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,7,0,0,3,7,0]
2408 ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1]
2409 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14
2410 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
2411 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
2412 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15]
2413 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2414 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,4,7,3,6,u,u,u]
2415 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14
2416 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2417 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8
2418 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8
2419 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
2420 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
2421 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
2422 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1,2,3,4,5,6,7],ymm12[8],ymm5[9,10,11,12,13,14,15]
2423 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3]
2424 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
2425 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0]
2426 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
2427 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
2428 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2429 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5]
2430 ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
2431 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4
2432 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
2433 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
2434 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15]
2435 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2436 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,0,3,7,u,u,u]
2437 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
2438 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2
2439 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2440 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
2441 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
2442 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
2443 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2444 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
2445 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2446 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2447 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2448 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi)
2449 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2450 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx)
2451 ; AVX2-FAST-NEXT: vmovdqa %ymm10, (%rcx)
2452 ; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8)
2453 ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9)
2454 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2455 ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax)
2456 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2457 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax)
2458 ; AVX2-FAST-NEXT: vzeroupper
2459 ; AVX2-FAST-NEXT: retq
2461 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf16:
2462 ; AVX2-FAST-PERLANE: # %bb.0:
2463 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
2464 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2465 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2
2466 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3
2467 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm10
2468 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm5
2469 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6
2470 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2471 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7
2472 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,8,9,10,11,6,7,6,7]
2473 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7]
2474 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
2475 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2476 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2]
2477 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
2478 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8
2479 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
2480 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2481 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
2482 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7]
2483 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2484 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2485 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
2486 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2487 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u]
2488 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8
2489 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
2490 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2491 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2492 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
2493 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7]
2494 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2495 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2496 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
2497 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15]
2498 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
2499 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm11
2500 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm13, %ymm8
2501 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2502 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
2503 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
2504 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2505 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2506 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2507 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
2508 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2509 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2510 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2511 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2512 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12
2513 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7]
2514 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2515 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1]
2516 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2517 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15]
2518 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
2519 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8
2520 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2521 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
2522 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7]
2523 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2524 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2525 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,2]
2526 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
2527 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
2528 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm12[1,2,3,4,5,6,7],ymm8[8],ymm12[9,10,11,12,13,14,15]
2529 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
2530 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2531 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
2532 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
2533 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
2534 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm13
2535 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,1,3]
2536 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
2537 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
2538 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2539 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14
2540 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7]
2541 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
2542 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2543 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3]
2544 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2545 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm13, %ymm14, %ymm11
2546 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
2547 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2548 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2549 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
2550 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14
2551 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13
2552 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
2553 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
2554 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2555 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2556 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2557 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7,8,9,10,11,12,13],ymm15[14],ymm14[15]
2558 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
2559 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2560 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm4
2561 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7]
2562 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2563 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2564 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9
2565 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7]
2566 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15]
2567 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7]
2568 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2569 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12
2570 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7]
2571 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2572 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2573 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm12
2574 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13
2575 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
2576 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
2577 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm14
2578 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2579 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15]
2580 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2581 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm14[2,3,0,1]
2582 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3,4,5,6],ymm7[7,8],ymm14[9,10,11,12,13,14],ymm7[15]
2583 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm7
2584 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2585 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15
2586 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm11
2587 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
2588 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
2589 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
2590 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3]
2591 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7]
2592 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
2593 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
2594 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
2595 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
2596 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2597 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
2598 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
2599 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5
2600 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2601 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
2602 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2603 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
2604 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
2605 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2606 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2607 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
2608 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
2609 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
2610 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2611 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
2612 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2613 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2614 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2615 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
2616 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2617 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx)
2618 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx)
2619 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%r8)
2620 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9)
2621 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2622 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax)
2623 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2624 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax)
2625 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2626 ; AVX2-FAST-PERLANE-NEXT: retq
2627 ;
2628 ; AVX512F-SLOW-LABEL: load_i16_stride7_vf16:
2629 ; AVX512F-SLOW: # %bb.0:
2630 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2631 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2632 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
2633 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3
2634 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2635 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
2636 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2637 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2638 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
2639 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
2640 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,u,u]
2641 ; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4
2642 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5
2643 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6
2644 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2645 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,6,4,7]
2646 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
2647 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
2648 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7]
2649 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2650 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
2651 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9
2652 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,2]
2653 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,1,2,1,4,5,6,5]
2654 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2655 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7]
2656 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
2657 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17
2658 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2659 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10
2660 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6],xmm7[7]
2661 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2662 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
2663 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7]
2664 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
2665 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
2666 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8
2667 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2668 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6,7,8,9,10],ymm8[11],ymm10[12,13,14,15]
2669 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2670 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2671 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2672 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
2673 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2674 ; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8
2675 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2676 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2677 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2678 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
2679 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7]
2680 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2681 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2682 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,1,2]
2683 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7]
2684 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2685 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7]
2686 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2687 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2688 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15]
2689 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2690 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2691 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2692 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
2693 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2694 ; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10
2695 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15]
2696 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
2697 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2698 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3]
2699 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2700 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2701 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2702 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
2703 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2704 ; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10
2705 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2706 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2707 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
2708 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7]
2709 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7]
2710 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2711 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
2712 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4]
2713 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
2714 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
2715 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
2716 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2717 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2718 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2719 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
2720 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2721 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2722 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5]
2723 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2724 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
2725 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2726 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2727 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2728 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7,8,9,10,11,12,13],ymm12[14],ymm11[15]
2729 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2730 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
2731 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
2732 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
2733 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
2734 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2735 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
2736 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
2737 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2738 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2739 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2740 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
2741 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2742 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2743 ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm12
2744 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13
2745 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
2746 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
2747 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
2748 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2749 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15]
2750 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2751 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2752 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
2753 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2754 ; AVX512F-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
2755 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm16[2,1,2,3]
2756 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2757 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
2758 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2759 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
2760 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
2761 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3,4,5,6,7]
2762 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15]
2763 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7]
2764 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
2765 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
2766 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
2767 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
2768 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
2769 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2770 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2771 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2772 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2773 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
2774 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2775 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
2776 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
2777 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2778 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
2779 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
2780 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2781 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
2782 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2783 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
2784 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
2785 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
2786 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2787 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi)
2788 ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx)
2789 ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
2790 ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8)
2791 ; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9)
2792 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2793 ; AVX512F-SLOW-NEXT: vmovdqa %ymm11, (%rax)
2794 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2795 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax)
2796 ; AVX512F-SLOW-NEXT: vzeroupper
2797 ; AVX512F-SLOW-NEXT: retq
2798 ;
2799 ; AVX512F-FAST-LABEL: load_i16_stride7_vf16:
2800 ; AVX512F-FAST: # %bb.0:
2801 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0
2802 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
2803 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
2804 ; AVX512F-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
2805 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15]
2806 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,9,u,13,u,u,u]
2807 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8
2808 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,9,u,12,u,u,u]
2809 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6
2810 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
2811 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3
2812 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
2813 ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1]
2814 ; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4
2815 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
2816 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
2817 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
2818 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5
2819 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
2820 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
2821 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2822 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4
2823 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
2824 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
2825 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
2826 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
2827 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
2828 ; AVX512F-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3
2829 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
2830 ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
2831 ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm12
2832 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
2833 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2834 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
2835 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2836 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2837 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2838 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
2839 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2840 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
2841 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2842 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
2843 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2844 ; AVX512F-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6
2845 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
2846 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
2847 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
2848 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2849 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
2850 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2851 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2852 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
2853 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14
2854 ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2855 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
2856 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2857 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
2858 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2859 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
2860 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2861 ; AVX512F-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8
2862 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15]
2863 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
2864 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
2865 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2866 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
2867 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
2868 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2869 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
2870 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
2871 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7]
2872 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
2873 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7
2874 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7]
2875 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9
2876 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2877 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2878 ; AVX512F-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7
2879 ; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13
2880 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,10,14,u,u,u]
2881 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15]
2882 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7]
2883 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9
2884 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
2885 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
2886 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
2887 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2888 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2889 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10
2890 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7]
2891 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
2892 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
2893 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14
2894 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
2895 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
2896 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
2897 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
2898 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9
2899 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7]
2900 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15]
2901 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
2902 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,u,0,3,7,u]
2903 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11
2904 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
2905 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
2906 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15]
2907 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
2908 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2909 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12
2910 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [1,4,8,11,15,u,u,u]
2911 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
2912 ; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,0,1]
2913 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,4,7,11,14,u,u,u]
2914 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
2915 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
2916 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13
2917 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
2918 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
2919 ; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1
2920 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
2921 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2922 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
2923 ; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1]
2924 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2
2925 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
2926 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
2927 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
2928 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0
2929 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0
2930 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
2931 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
2932 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
2933 ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
2934 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2935 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
2936 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
2937 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2938 ; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rsi)
2939 ; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rdx)
2940 ; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%rcx)
2941 ; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%r8)
2942 ; AVX512F-FAST-NEXT: vmovdqa %ymm9, (%r9)
2943 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2944 ; AVX512F-FAST-NEXT: vmovdqa %ymm11, (%rax)
2945 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2946 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax)
2947 ; AVX512F-FAST-NEXT: vzeroupper
2948 ; AVX512F-FAST-NEXT: retq
2949 ;
2950 ; AVX512BW-LABEL: load_i16_stride7_vf16:
2951 ; AVX512BW: # %bb.0:
2952 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2953 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
2954 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2955 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
2956 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
2957 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
2958 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
2959 ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
2960 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
2961 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u]
2962 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
2963 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
2964 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
2965 ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
2966 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
2967 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u]
2968 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
2969 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
2970 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2971 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
2972 ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
2973 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
2974 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u]
2975 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
2976 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
2977 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
2978 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
2979 ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
2980 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
2981 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u]
2982 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
2983 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2984 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2985 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
2986 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
2987 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
2988 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u]
2989 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
2990 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
2991 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2992 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
2993 ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
2994 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
2995 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u]
2996 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10
2997 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
2998 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2999 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
3000 ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
3001 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3002 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u]
3003 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3004 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15]
3005 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3006 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
3007 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
3008 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx)
3009 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
3010 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9)
3011 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r10)
3012 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
3013 ; AVX512BW-NEXT: vzeroupper
3014 ; AVX512BW-NEXT: retq
3015 %wide.vec = load <112 x i16>, ptr %in.vec, align 64
3016 %strided.vec0 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
3017 %strided.vec1 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
3018 %strided.vec2 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
3019 %strided.vec3 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
3020 %strided.vec4 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
3021 %strided.vec5 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
3022 %strided.vec6 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
3023 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
3024 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
3025 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
3026 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
3027 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
3028 store <16 x i16> %strided.vec5, ptr %out.vec5, align 64
3029 store <16 x i16> %strided.vec6, ptr %out.vec6, align 64
3030 ret void
3031 }
3033 define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
3034 ; SSE-LABEL: load_i16_stride7_vf32:
3035 ; SSE: # %bb.0:
3036 ; SSE-NEXT: subq $600, %rsp # imm = 0x258
3037 ; SSE-NEXT: movdqa 304(%rdi), %xmm5
3038 ; SSE-NEXT: movdqa 288(%rdi), %xmm6
3039 ; SSE-NEXT: movdqa 112(%rdi), %xmm13
3040 ; SSE-NEXT: movdqa 128(%rdi), %xmm8
3041 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3042 ; SSE-NEXT: movaps 160(%rdi), %xmm7
3043 ; SSE-NEXT: movaps 144(%rdi), %xmm10
3044 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3045 ; SSE-NEXT: movdqa 192(%rdi), %xmm9
3046 ; SSE-NEXT: movdqa 176(%rdi), %xmm12
3047 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
3048 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
3049 ; SSE-NEXT: movdqa %xmm1, %xmm11
3050 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3051 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
3052 ; SSE-NEXT: movdqa %xmm2, %xmm1
3053 ; SSE-NEXT: pandn %xmm0, %xmm1
3054 ; SSE-NEXT: movdqa %xmm12, %xmm0
3055 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3056 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
3057 ; SSE-NEXT: pand %xmm2, %xmm0
3058 ; SSE-NEXT: por %xmm1, %xmm0
3059 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
3060 ; SSE-NEXT: movdqa %xmm15, %xmm1
3061 ; SSE-NEXT: pandn %xmm0, %xmm1
3062 ; SSE-NEXT: movaps %xmm10, %xmm0
3063 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2]
3064 ; SSE-NEXT: movaps %xmm7, %xmm10
3065 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3066 ; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535]
3067 ; SSE-NEXT: movaps %xmm14, %xmm3
3068 ; SSE-NEXT: andnps %xmm0, %xmm3
3069 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
3070 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3]
3071 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3072 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
3073 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3074 ; SSE-NEXT: movdqa 320(%rdi), %xmm0
3075 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3076 ; SSE-NEXT: pand %xmm14, %xmm4
3077 ; SSE-NEXT: por %xmm3, %xmm4
3078 ; SSE-NEXT: pand %xmm15, %xmm4
3079 ; SSE-NEXT: por %xmm1, %xmm4
3080 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3081 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3082 ; SSE-NEXT: movdqa %xmm2, %xmm1
3083 ; SSE-NEXT: pandn %xmm0, %xmm1
3084 ; SSE-NEXT: movdqa %xmm6, %xmm0
3085 ; SSE-NEXT: movdqa %xmm6, %xmm7
3086 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3087 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
3088 ; SSE-NEXT: movdqa %xmm5, %xmm6
3089 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3090 ; SSE-NEXT: pand %xmm2, %xmm0
3091 ; SSE-NEXT: por %xmm1, %xmm0
3092 ; SSE-NEXT: movdqa %xmm15, %xmm1
3093 ; SSE-NEXT: pandn %xmm0, %xmm1
3094 ; SSE-NEXT: movaps 272(%rdi), %xmm3
3095 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3096 ; SSE-NEXT: movaps 256(%rdi), %xmm0
3097 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3098 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
3099 ; SSE-NEXT: movaps %xmm14, %xmm3
3100 ; SSE-NEXT: andnps %xmm0, %xmm3
3101 ; SSE-NEXT: movdqa 224(%rdi), %xmm0
3102 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3103 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3104 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
3105 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
3106 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
3108 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3109 ; SSE-NEXT: pand %xmm14, %xmm4
3110 ; SSE-NEXT: por %xmm3, %xmm4
3111 ; SSE-NEXT: pand %xmm15, %xmm4
3112 ; SSE-NEXT: por %xmm1, %xmm4
3113 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3114 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
3115 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3116 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3117 ; SSE-NEXT: movdqa %xmm2, %xmm1
3118 ; SSE-NEXT: pandn %xmm0, %xmm1
3119 ; SSE-NEXT: movdqa 416(%rdi), %xmm3
3120 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3121 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
3122 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
3123 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3124 ; SSE-NEXT: pand %xmm2, %xmm0
3125 ; SSE-NEXT: por %xmm1, %xmm0
3126 ; SSE-NEXT: movdqa %xmm15, %xmm1
3127 ; SSE-NEXT: pandn %xmm0, %xmm1
3128 ; SSE-NEXT: movaps 384(%rdi), %xmm3
3129 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3130 ; SSE-NEXT: movaps 368(%rdi), %xmm0
3131 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3132 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
3133 ; SSE-NEXT: movaps %xmm14, %xmm3
3134 ; SSE-NEXT: andnps %xmm0, %xmm3
3135 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
3136 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3137 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3138 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
3139 ; SSE-NEXT: movdqa 352(%rdi), %xmm8
3140 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
3141 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3142 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3143 ; SSE-NEXT: pand %xmm14, %xmm4
3144 ; SSE-NEXT: por %xmm3, %xmm4
3145 ; SSE-NEXT: pand %xmm15, %xmm4
3146 ; SSE-NEXT: por %xmm1, %xmm4
3147 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3148 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
3149 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3150 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3151 ; SSE-NEXT: movdqa %xmm2, %xmm1
3152 ; SSE-NEXT: pandn %xmm0, %xmm1
3153 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
3154 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3155 ; SSE-NEXT: movdqa 64(%rdi), %xmm0
3156 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3157 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
3158 ; SSE-NEXT: pand %xmm2, %xmm0
3159 ; SSE-NEXT: por %xmm1, %xmm0
3160 ; SSE-NEXT: movdqa %xmm15, %xmm1
3161 ; SSE-NEXT: pandn %xmm0, %xmm1
3162 ; SSE-NEXT: movaps 32(%rdi), %xmm0
3163 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3164 ; SSE-NEXT: movaps 48(%rdi), %xmm4
3165 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3166 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
3167 ; SSE-NEXT: movaps %xmm14, %xmm3
3168 ; SSE-NEXT: andnps %xmm0, %xmm3
3169 ; SSE-NEXT: movdqa (%rdi), %xmm0
3170 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3171 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3172 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
3173 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
3174 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3175 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
3176 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3177 ; SSE-NEXT: pand %xmm14, %xmm4
3178 ; SSE-NEXT: por %xmm3, %xmm4
3179 ; SSE-NEXT: pand %xmm15, %xmm4
3180 ; SSE-NEXT: por %xmm1, %xmm4
3181 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3182 ; SSE-NEXT: movdqa %xmm11, %xmm0
3183 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3184 ; SSE-NEXT: movdqa %xmm2, %xmm1
3185 ; SSE-NEXT: pandn %xmm0, %xmm1
3186 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3187 ; SSE-NEXT: movdqa %xmm9, %xmm0
3188 ; SSE-NEXT: psrld $16, %xmm0
3189 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3190 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
3191 ; SSE-NEXT: pand %xmm2, %xmm12
3192 ; SSE-NEXT: por %xmm1, %xmm12
3193 ; SSE-NEXT: movdqa %xmm15, %xmm0
3194 ; SSE-NEXT: pandn %xmm12, %xmm0
3195 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535]
3196 ; SSE-NEXT: movdqa %xmm11, %xmm1
3197 ; SSE-NEXT: pandn %xmm13, %xmm1
3198 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3199 ; SSE-NEXT: movdqa %xmm13, %xmm4
3200 ; SSE-NEXT: pand %xmm11, %xmm4
3201 ; SSE-NEXT: por %xmm1, %xmm4
3202 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3203 ; SSE-NEXT: movdqa %xmm12, %xmm1
3204 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3205 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
3206 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
3207 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
3208 ; SSE-NEXT: movdqa %xmm14, %xmm3
3209 ; SSE-NEXT: pandn %xmm1, %xmm3
3210 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3211 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3212 ; SSE-NEXT: pand %xmm14, %xmm1
3213 ; SSE-NEXT: por %xmm1, %xmm3
3214 ; SSE-NEXT: pand %xmm15, %xmm3
3215 ; SSE-NEXT: por %xmm0, %xmm3
3216 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3217 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3218 ; SSE-NEXT: movdqa %xmm5, %xmm0
3219 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3220 ; SSE-NEXT: movdqa %xmm2, %xmm1
3221 ; SSE-NEXT: pandn %xmm0, %xmm1
3222 ; SSE-NEXT: psrld $16, %xmm6
3223 ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3224 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
3225 ; SSE-NEXT: pand %xmm2, %xmm7
3226 ; SSE-NEXT: por %xmm1, %xmm7
3227 ; SSE-NEXT: movdqa %xmm15, %xmm0
3228 ; SSE-NEXT: pandn %xmm7, %xmm0
3229 ; SSE-NEXT: movdqa %xmm11, %xmm1
3230 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3231 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3232 ; SSE-NEXT: pand %xmm11, %xmm4
3233 ; SSE-NEXT: por %xmm1, %xmm4
3234 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3235 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3236 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3237 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
3238 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
3239 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
3240 ; SSE-NEXT: movdqa %xmm14, %xmm3
3241 ; SSE-NEXT: pandn %xmm1, %xmm3
3242 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3243 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3244 ; SSE-NEXT: pand %xmm14, %xmm1
3245 ; SSE-NEXT: por %xmm1, %xmm3
3246 ; SSE-NEXT: pand %xmm15, %xmm3
3247 ; SSE-NEXT: por %xmm0, %xmm3
3248 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3249 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3250 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3251 ; SSE-NEXT: movdqa %xmm2, %xmm1
3252 ; SSE-NEXT: pandn %xmm0, %xmm1
3253 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3254 ; SSE-NEXT: psrld $16, %xmm0
3255 ; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload
3256 ; SSE-NEXT: movdqa %xmm7, %xmm4
3257 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3258 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
3259 ; SSE-NEXT: pand %xmm2, %xmm4
3260 ; SSE-NEXT: por %xmm1, %xmm4
3261 ; SSE-NEXT: movdqa %xmm15, %xmm0
3262 ; SSE-NEXT: pandn %xmm4, %xmm0
3263 ; SSE-NEXT: movdqa %xmm11, %xmm1
3264 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3265 ; SSE-NEXT: movdqa %xmm8, %xmm4
3266 ; SSE-NEXT: pand %xmm11, %xmm4
3267 ; SSE-NEXT: por %xmm1, %xmm4
3268 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3269 ; SSE-NEXT: movdqa %xmm3, %xmm1
3270 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3271 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3272 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
3273 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
3274 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
3275 ; SSE-NEXT: movdqa %xmm14, %xmm8
3276 ; SSE-NEXT: pandn %xmm1, %xmm8
3277 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3278 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3279 ; SSE-NEXT: pand %xmm14, %xmm1
3280 ; SSE-NEXT: por %xmm1, %xmm8
3281 ; SSE-NEXT: pand %xmm15, %xmm8
3282 ; SSE-NEXT: por %xmm0, %xmm8
3283 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3284 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3285 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3286 ; SSE-NEXT: movdqa %xmm2, %xmm1
3287 ; SSE-NEXT: pandn %xmm0, %xmm1
3288 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3289 ; SSE-NEXT: psrld $16, %xmm0
3290 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3291 ; SSE-NEXT: movdqa %xmm10, %xmm4
3292 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3293 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
3294 ; SSE-NEXT: pand %xmm2, %xmm4
3295 ; SSE-NEXT: por %xmm1, %xmm4
3296 ; SSE-NEXT: movdqa %xmm15, %xmm0
3297 ; SSE-NEXT: pandn %xmm4, %xmm0
3298 ; SSE-NEXT: movdqa %xmm11, %xmm1
3299 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3300 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3301 ; SSE-NEXT: pand %xmm11, %xmm4
3302 ; SSE-NEXT: por %xmm1, %xmm4
3303 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3304 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3305 ; SSE-NEXT: pand %xmm14, %xmm1
3306 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3307 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3308 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
3309 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
3310 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
3311 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
3312 ; SSE-NEXT: pandn %xmm4, %xmm14
3313 ; SSE-NEXT: por %xmm1, %xmm14
3314 ; SSE-NEXT: pand %xmm15, %xmm14
3315 ; SSE-NEXT: por %xmm0, %xmm14
3316 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3317 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3318 ; SSE-NEXT: # xmm0 = mem[0,1,0,1]
3319 ; SSE-NEXT: movdqa %xmm2, %xmm1
3320 ; SSE-NEXT: pandn %xmm0, %xmm1
3321 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3322 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
3323 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3324 ; SSE-NEXT: pand %xmm2, %xmm0
3325 ; SSE-NEXT: por %xmm1, %xmm0
3326 ; SSE-NEXT: movdqa %xmm15, %xmm1
3327 ; SSE-NEXT: pandn %xmm0, %xmm1
3328 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3]
3329 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,4,7]
3330 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3331 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
3332 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1]
3333 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3334 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3]
3335 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3]
3336 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3337 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3]
3338 ; SSE-NEXT: andps %xmm15, %xmm8
3339 ; SSE-NEXT: orps %xmm1, %xmm8
3340 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3341 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
3342 ; SSE-NEXT: movdqa %xmm2, %xmm4
3343 ; SSE-NEXT: pandn %xmm1, %xmm4
3344 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3345 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3346 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
3347 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3348 ; SSE-NEXT: pand %xmm2, %xmm1
3349 ; SSE-NEXT: por %xmm4, %xmm1
3350 ; SSE-NEXT: movdqa %xmm15, %xmm4
3351 ; SSE-NEXT: pandn %xmm1, %xmm4
3352 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3353 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
3354 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,4,7]
3355 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3356 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
3357 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1]
3358 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3359 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3360 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3361 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
3362 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
3363 ; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3]
3364 ; SSE-NEXT: andps %xmm15, %xmm14
3365 ; SSE-NEXT: orps %xmm4, %xmm14
3366 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3367 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3368 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
3369 ; SSE-NEXT: movdqa %xmm2, %xmm5
3370 ; SSE-NEXT: pandn %xmm4, %xmm5
3371 ; SSE-NEXT: movdqa %xmm7, %xmm4
3372 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3373 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
3374 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
3375 ; SSE-NEXT: pand %xmm2, %xmm4
3376 ; SSE-NEXT: por %xmm5, %xmm4
3377 ; SSE-NEXT: movdqa %xmm15, %xmm7
3378 ; SSE-NEXT: pandn %xmm4, %xmm7
3379 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3]
3380 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7]
3381 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3382 ; SSE-NEXT: # xmm4 = mem[2,2,3,3]
3383 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
3384 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3385 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3386 ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
3387 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3]
3388 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3389 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
3390 ; SSE-NEXT: andps %xmm15, %xmm3
3391 ; SSE-NEXT: orps %xmm7, %xmm3
3392 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3393 ; SSE-NEXT: movdqa %xmm10, %xmm4
3394 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3395 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
3396 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
3397 ; SSE-NEXT: pand %xmm2, %xmm4
3398 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3399 ; SSE-NEXT: # xmm7 = mem[0,1,0,1]
3400 ; SSE-NEXT: pandn %xmm7, %xmm2
3401 ; SSE-NEXT: por %xmm4, %xmm2
3402 ; SSE-NEXT: movdqa %xmm15, %xmm4
3403 ; SSE-NEXT: pandn %xmm2, %xmm4
3404 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3405 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
3406 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
3407 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3408 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3]
3409 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
3410 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3411 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3412 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
3413 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,1,2,3]
3414 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
3415 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3]
3416 ; SSE-NEXT: andps %xmm15, %xmm0
3417 ; SSE-NEXT: orps %xmm4, %xmm0
3418 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3419 ; SSE-NEXT: movdqa %xmm11, %xmm4
3420 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3421 ; SSE-NEXT: movdqa %xmm9, %xmm7
3422 ; SSE-NEXT: pand %xmm11, %xmm7
3423 ; SSE-NEXT: por %xmm4, %xmm7
3424 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,1,1,0,4,5,6,7]
3425 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
3426 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7]
3427 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3428 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3]
3429 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3430 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3431 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3432 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3433 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7]
3434 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3435 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
3436 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
3437 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
3438 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
3439 ; SSE-NEXT: movdqa %xmm15, %xmm9
3440 ; SSE-NEXT: pandn %xmm0, %xmm9
3441 ; SSE-NEXT: andps %xmm15, %xmm4
3442 ; SSE-NEXT: por %xmm4, %xmm9
3443 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3444 ; SSE-NEXT: movdqa %xmm11, %xmm0
3445 ; SSE-NEXT: pandn %xmm8, %xmm0
3446 ; SSE-NEXT: movdqa %xmm12, %xmm4
3447 ; SSE-NEXT: pand %xmm11, %xmm4
3448 ; SSE-NEXT: por %xmm0, %xmm4
3449 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7]
3450 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3451 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
3452 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
3453 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3454 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3455 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
3456 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3457 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
3458 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3459 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
3460 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3461 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3462 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
3463 ; SSE-NEXT: movdqa %xmm15, %xmm4
3464 ; SSE-NEXT: pandn %xmm1, %xmm4
3465 ; SSE-NEXT: andps %xmm15, %xmm0
3466 ; SSE-NEXT: por %xmm0, %xmm4
3467 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3468 ; SSE-NEXT: movdqa %xmm11, %xmm0
3469 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3470 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3471 ; SSE-NEXT: movdqa %xmm12, %xmm1
3472 ; SSE-NEXT: pand %xmm11, %xmm1
3473 ; SSE-NEXT: por %xmm0, %xmm1
3474 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7]
3475 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3476 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7]
3477 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
3478 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3479 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
3480 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3481 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3482 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3483 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
3484 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3485 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3486 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3487 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3488 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
3489 ; SSE-NEXT: movdqa %xmm15, %xmm4
3490 ; SSE-NEXT: pandn %xmm1, %xmm4
3491 ; SSE-NEXT: andps %xmm15, %xmm0
3492 ; SSE-NEXT: por %xmm0, %xmm4
3493 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3494 ; SSE-NEXT: movdqa %xmm11, %xmm0
3495 ; SSE-NEXT: pandn %xmm3, %xmm0
3496 ; SSE-NEXT: movdqa %xmm14, %xmm1
3497 ; SSE-NEXT: pand %xmm11, %xmm1
3498 ; SSE-NEXT: por %xmm0, %xmm1
3499 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7]
3500 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3501 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7]
3502 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
3503 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3504 ; SSE-NEXT: movdqa %xmm10, %xmm2
3505 ; SSE-NEXT: movdqa %xmm10, %xmm1
3506 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3507 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3508 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3509 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
3510 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3511 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3512 ; SSE-NEXT: andps %xmm15, %xmm0
3513 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3514 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3515 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
3516 ; SSE-NEXT: pandn %xmm1, %xmm15
3517 ; SSE-NEXT: por %xmm0, %xmm15
3518 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3519 ; SSE-NEXT: movdqa %xmm9, %xmm0
3520 ; SSE-NEXT: psrld $16, %xmm0
3521 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3522 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3523 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3524 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
3525 ; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
3526 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3527 ; SSE-NEXT: psrlq $48, %xmm1
3528 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3529 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3530 ; SSE-NEXT: movdqa %xmm4, %xmm1
3531 ; SSE-NEXT: psrlq $16, %xmm1
3532 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3533 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
3534 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3535 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3536 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3537 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
3538 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3539 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
3540 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
3541 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3542 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3543 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3544 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3545 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3546 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3547 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3548 ; SSE-NEXT: movdqa %xmm1, %xmm0
3549 ; SSE-NEXT: psrld $16, %xmm0
3550 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3551 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
3552 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3553 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3554 ; SSE-NEXT: movdqa %xmm1, %xmm6
3555 ; SSE-NEXT: movdqa %xmm7, %xmm1
3556 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3557 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3558 ; SSE-NEXT: psrlq $48, %xmm1
3559 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3560 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3561 ; SSE-NEXT: movdqa %xmm14, %xmm1
3562 ; SSE-NEXT: psrlq $16, %xmm1
3563 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3564 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
3565 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3566 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3567 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3568 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
3569 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3570 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
3571 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
3572 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3573 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3574 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3575 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3576 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3577 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3578 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3579 ; SSE-NEXT: movdqa %xmm1, %xmm7
3580 ; SSE-NEXT: psrld $16, %xmm7
3581 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3582 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
3583 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3584 ; SSE-NEXT: movdqa %xmm1, %xmm6
3585 ; SSE-NEXT: movdqa %xmm8, %xmm1
3586 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
3587 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3588 ; SSE-NEXT: psrlq $48, %xmm1
3589 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3590 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3591 ; SSE-NEXT: movdqa %xmm13, %xmm1
3592 ; SSE-NEXT: psrlq $16, %xmm1
3593 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
3594 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3595 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3596 ; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload
3597 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
3598 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3599 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3600 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
3601 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3602 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3603 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3604 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3605 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3606 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3607 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3608 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3609 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3610 ; SSE-NEXT: movdqa %xmm1, %xmm6
3611 ; SSE-NEXT: psrld $16, %xmm6
3612 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3613 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
3614 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3615 ; SSE-NEXT: movdqa %xmm1, %xmm12
3616 ; SSE-NEXT: movdqa %xmm9, %xmm1
3617 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
3618 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3619 ; SSE-NEXT: psrlq $48, %xmm1
3620 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3621 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3622 ; SSE-NEXT: psrlq $16, %xmm1
3623 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3624 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3625 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3626 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3627 ; SSE-NEXT: movdqa %xmm2, %xmm12
3628 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
3629 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3630 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3631 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3632 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3633 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3634 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3635 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3636 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3637 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3638 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3639 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
3640 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
3641 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3642 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3643 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
3644 ; SSE-NEXT: movdqa %xmm11, %xmm0
3645 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3646 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3647 ; SSE-NEXT: pand %xmm11, %xmm1
3648 ; SSE-NEXT: por %xmm0, %xmm1
3649 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
3650 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3651 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3652 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3653 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3654 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
3655 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3656 ; SSE-NEXT: movdqa %xmm14, %xmm0
3657 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
3658 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3659 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3660 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3661 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
3662 ; SSE-NEXT: movdqa %xmm11, %xmm0
3663 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3664 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3665 ; SSE-NEXT: pand %xmm11, %xmm1
3666 ; SSE-NEXT: por %xmm0, %xmm1
3667 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
3668 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3669 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3670 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
3671 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3672 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2]
3673 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3674 ; SSE-NEXT: movdqa %xmm13, %xmm0
3675 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3676 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
3677 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3678 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3679 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
3680 ; SSE-NEXT: movdqa %xmm11, %xmm0
3681 ; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload
3682 ; SSE-NEXT: pand %xmm11, %xmm8
3683 ; SSE-NEXT: por %xmm0, %xmm8
3684 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3]
3685 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3686 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3687 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
3688 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3689 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2]
3690 ; SSE-NEXT: pand %xmm11, %xmm2
3691 ; SSE-NEXT: pandn %xmm12, %xmm11
3692 ; SSE-NEXT: por %xmm2, %xmm11
3693 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3694 ; SSE-NEXT: movdqa %xmm12, %xmm0
3695 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3696 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
3697 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3698 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3699 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
3700 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3]
3701 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3702 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3703 ; SSE-NEXT: movdqa %xmm3, %xmm11
3704 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3705 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2]
3706 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
3707 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3708 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3709 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3710 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
3711 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3712 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
3713 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3714 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3715 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3716 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
3717 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3718 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
3719 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3720 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
3721 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2]
3722 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
3723 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3724 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
3725 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3726 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
3727 ; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3728 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
3729 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
3730 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3731 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
3732 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
3733 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3734 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3735 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
3736 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3737 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3738 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
3739 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
3740 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
3741 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
3742 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
3743 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
3744 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3745 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
3746 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
3747 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3748 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
3749 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
3750 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3751 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3752 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
3753 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
3754 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
3755 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3]
3756 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
3757 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3758 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3759 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
3760 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
3761 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3762 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
3763 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
3764 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3765 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
3766 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
3767 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3768 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3769 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
3770 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3771 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3772 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
3773 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3774 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3775 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3776 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
3777 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3778 ; SSE-NEXT: movaps %xmm1, (%rsi)
3779 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3780 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
3781 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3782 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
3783 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3784 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
3785 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3786 ; SSE-NEXT: movaps %xmm1, (%rdx)
3787 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3788 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
3789 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3790 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
3791 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3792 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
3793 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3794 ; SSE-NEXT: movaps %xmm1, (%rcx)
3795 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3796 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
3797 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3798 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
3799 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3800 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
3801 ; SSE-NEXT: movdqa %xmm15, (%r8)
3802 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3803 ; SSE-NEXT: movaps %xmm1, 48(%r8)
3804 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3805 ; SSE-NEXT: movaps %xmm1, 32(%r8)
3806 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3807 ; SSE-NEXT: movaps %xmm1, 16(%r8)
3808 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3809 ; SSE-NEXT: movaps %xmm1, (%r9)
3810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3811 ; SSE-NEXT: movaps %xmm1, 48(%r9)
3812 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3813 ; SSE-NEXT: movaps %xmm1, 32(%r9)
3814 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3815 ; SSE-NEXT: movaps %xmm1, 16(%r9)
3816 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3817 ; SSE-NEXT: movaps %xmm6, (%rax)
3818 ; SSE-NEXT: movaps %xmm7, 48(%rax)
3819 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3820 ; SSE-NEXT: movaps %xmm1, 32(%rax)
3821 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3822 ; SSE-NEXT: movaps %xmm1, 16(%rax)
3823 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3824 ; SSE-NEXT: movapd %xmm0, (%rax)
3825 ; SSE-NEXT: movapd %xmm3, 48(%rax)
3826 ; SSE-NEXT: movapd %xmm4, 32(%rax)
3827 ; SSE-NEXT: movapd %xmm5, 16(%rax)
3828 ; SSE-NEXT: addq $600, %rsp # imm = 0x258
3829 ; SSE-NEXT: retq
3830 ;
3831 ; AVX1-ONLY-LABEL: load_i16_stride7_vf32:
3832 ; AVX1-ONLY: # %bb.0:
3833 ; AVX1-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8
3834 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15
3835 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm0
3836 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3837 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8
3838 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[2,2,3,3]
3839 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3840 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3841 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
3842 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3843 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1
3844 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2
3845 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
3846 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3847 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
3848 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1
3849 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3850 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2
3851 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3852 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3853 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3854 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3855 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
3856 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
3857 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3858 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
3859 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3860 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3861 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
3862 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3863 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
3864 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
3865 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3
3866 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,3]
3867 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13
3868 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3869 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
3870 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
3871 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3
3872 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3873 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0
3874 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3875 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
3876 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
3877 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
3878 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
3879 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6
3880 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7
3881 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero
3882 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3883 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3884 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
3885 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
3886 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2
3887 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3
3888 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
3889 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3890 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
3891 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3892 ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1
3893 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3894 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
3895 ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2
3896 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3897 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
3898 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3899 ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm9
3900 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm9, %xmm2
3901 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3902 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3
3903 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3904 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3905 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
3906 ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3
3907 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3908 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2
3909 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3910 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3911 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3912 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
3913 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
3914 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2
3915 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3916 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
3917 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
3918 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm14
3919 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3]
3920 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3921 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3922 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5
3923 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11
3924 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm11[2],zero
3925 ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3926 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3927 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
3928 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4
3929 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3930 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3
3931 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3932 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
3933 ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4
3934 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3935 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
3936 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
3937 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4
3938 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3939 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
3940 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3941 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
3942 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
3943 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm0
3944 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
3945 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3946 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
3947 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3948 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
3949 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
3950 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3951 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3952 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
3953 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm10[6],xmm15[7]
3954 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
3955 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
3956 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
3957 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3958 ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1
3959 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3960 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
3961 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
3962 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3963 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
3964 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3965 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3966 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3967 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3968 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
3969 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
3970 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm2
3971 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
3972 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3973 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
3974 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
3975 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4
3976 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3977 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
3978 ; AVX1-ONLY-NEXT: # xmm3 = xmm13[0],mem[1],xmm13[2,3,4,5,6,7]
3979 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
3980 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7]
3981 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
3982 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
3983 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1
3984 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3
3985 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1
3986 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
3987 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
3988 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0
3989 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
3990 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
3991 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3992 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3993 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3994 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3995 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
3996 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3997 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3998 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7]
3999 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
4000 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
4001 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4002 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4003 ; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm1
4004 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4005 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
4006 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
4007 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
4008 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
4009 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4010 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
4011 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
4012 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
4013 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4],xmm4[5,6,7]
4014 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4015 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4
4016 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4017 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4018 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4019 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4020 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
4021 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
4022 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4023 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm5
4024 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
4025 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4026 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
4027 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4
4028 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1
4029 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4030 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4031 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0
4032 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
4033 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
4034 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4035 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4036 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm0
4037 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4038 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4039 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,3,2,3]
4040 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4041 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6,7]
4042 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4043 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4044 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4045 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
4046 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
4047 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4048 ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2]
4049 ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4050 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
4051 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4052 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1]
4053 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm4[7]
4054 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4055 ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
4056 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4057 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4058 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3]
4059 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4060 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4061 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3]
4062 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm5[1]
4063 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4064 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
4065 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
4066 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
4067 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7]
4068 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4069 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
4070 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5
4071 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
4072 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4073 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4074 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1
4075 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4
4076 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1
4077 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4078 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm1
4079 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4080 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
4081 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3]
4082 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
4083 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4084 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7]
4085 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4086 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4087 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
4088 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7]
4089 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4090 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
4091 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4092 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4093 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
4094 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1]
4095 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
4096 ; AVX1-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
4097 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,2,3]
4098 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
4099 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7]
4100 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,2,2,2]
4101 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5],xmm15[6,7]
4102 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4103 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1]
4104 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7]
4105 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4106 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
4107 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
4108 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4109 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5
4110 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm3, %ymm14
4111 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5
4112 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4113 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4
4114 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5
4115 ; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm6
4116 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
4117 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4118 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4119 ; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
4120 ; AVX1-ONLY-NEXT: # xmm4 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
4121 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
4122 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
4123 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4124 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
4125 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm3[6],mem[7]
4126 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
4127 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7]
4128 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7]
4129 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4130 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
4131 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7]
4132 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
4133 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7]
4134 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4135 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
4136 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4137 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7]
4138 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7]
4139 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
4140 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3,4,5,6,7]
4141 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4142 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
4143 ; AVX1-ONLY-NEXT: # xmm5 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
4144 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
4145 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
4146 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
4147 ; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
4148 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
4149 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15
4150 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4151 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm13
4152 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13
4153 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4154 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0
4155 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13
4156 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0
4157 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm13
4158 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13
4159 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0
4160 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0
4161 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4162 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4163 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload
4164 ; AVX1-ONLY-NEXT: # xmm0 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
4165 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4166 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7]
4167 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4168 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload
4169 ; AVX1-ONLY-NEXT: # xmm13 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7]
4170 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7]
4171 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,6,7]
4172 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm0[1,2],xmm13[3,4,5,6,7]
4173 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4174 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4175 ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4176 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,2,3,4,5,6,7]
4177 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
4178 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
4179 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4180 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
4181 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7]
4182 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
4183 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
4184 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7]
4185 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4186 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
4187 ; AVX1-ONLY-NEXT: # xmm14 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
4188 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
4189 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
4190 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
4191 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5
4192 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm14
4193 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5
4194 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1
4195 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm2
4196 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
4197 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2
4198 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4199 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
4200 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
4201 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
4202 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4203 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4204 ; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1]
4205 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
4206 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6,7]
4207 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4208 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3]
4209 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
4210 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4211 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
4212 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7]
4213 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1]
4214 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4215 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
4216 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4217 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1
4218 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4219 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
4220 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4221 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm4
4222 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4223 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
4224 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
4225 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4226 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,0,3]
4227 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4228 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4229 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4230 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
4231 ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4232 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7]
4233 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
4234 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3]
4235 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5
4236 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
4237 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4238 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4239 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
4240 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4
4241 ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm13
4242 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
4243 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4244 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
4245 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4246 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
4247 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4248 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
4249 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
4250 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
4251 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7]
4252 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4253 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4254 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4255 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4256 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2
4257 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4258 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
4259 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
4260 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4261 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm4
4262 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4263 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4264 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7]
4265 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4266 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
4267 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
4268 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4269 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4270 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4271 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
4272 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1
4273 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4274 ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3]
4275 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
4276 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
4277 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4278 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0
4279 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1
4280 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
4281 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4282 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm0
4283 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4284 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4285 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4286 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
4287 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
4288 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1
4289 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
4290 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4291 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4292 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm11[6],xmm13[7]
4293 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4294 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
4295 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4296 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1
4297 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4298 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4299 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4300 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
4301 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
4302 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
4303 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
4304 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload
4305 ; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm14[1],mem[2,3,4,5,6,7]
4306 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4307 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,7]
4308 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4309 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
4310 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
4311 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
4312 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4313 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4314 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
4315 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
4316 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4317 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4318 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0
4319 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
4320 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
4321 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4322 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4323 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
4324 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1
4325 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2
4326 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4327 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4328 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4329 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
4330 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4331 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4332 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5],xmm0[6],xmm7[7]
4333 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4334 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
4335 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
4336 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm2
4337 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
4338 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4339 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
4340 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10
4341 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
4342 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
4343 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7]
4344 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4345 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4346 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm15[1],xmm9[2,3,4,5,6,7]
4347 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
4348 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7]
4349 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4350 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
4351 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
4352 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
4353 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4354 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4355 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
4356 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
4357 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4358 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4359 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1
4360 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2
4361 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
4362 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4363 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
4364 ; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1],mem[0],zero
4365 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4366 ; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
4367 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4368 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
4369 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm13[1],xmm11[2,3,4,5,6,7]
4370 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
4371 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
4372 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
4373 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4374 ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
4375 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4376 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4377 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4378 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4379 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4380 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
4381 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
4382 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
4383 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
4384 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4385 ; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
4386 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4387 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
4388 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,0,3]
4389 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
4390 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7]
4391 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4392 ; AVX1-ONLY-NEXT: # xmm12 = mem[3,3,3,3]
4393 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5
4394 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
4395 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4396 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4397 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
4398 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4
4399 ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11
4400 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
4401 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4402 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4403 ; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero
4404 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4405 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4406 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4407 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
4408 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7]
4409 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
4410 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
4411 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
4412 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1]
4413 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4414 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4415 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4416 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
4417 ; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4418 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
4419 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
4420 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
4421 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
4422 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
4423 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,3]
4424 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
4425 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
4426 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3]
4427 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
4428 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
4429 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4430 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4
4431 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm3
4432 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
4433 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4434 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi)
4435 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
4436 ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi)
4437 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4438 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
4439 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4440 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx)
4441 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4442 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx)
4443 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4444 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx)
4445 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4446 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
4447 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4448 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8)
4449 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4450 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9)
4451 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4452 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9)
4453 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
4454 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax)
4455 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4456 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
4457 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
4458 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax)
4459 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax)
4460 ; AVX1-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8
4461 ; AVX1-ONLY-NEXT: vzeroupper
4462 ; AVX1-ONLY-NEXT: retq
;
4464 ; AVX2-SLOW-LABEL: load_i16_stride7_vf32:
4465 ; AVX2-SLOW: # %bb.0:
4466 ; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208
4467 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10
4468 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9
4469 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5
4470 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13
4471 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
4472 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
4473 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11
4474 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
4475 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
4476 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8
4477 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
4478 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
4479 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
4480 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
4481 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6
4482 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7
4483 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
4484 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
4485 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
4486 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm3
4487 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
4488 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1
4489 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4490 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3
4491 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7]
4492 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5
4493 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
4494 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
4495 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
4496 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
4497 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0
4498 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
4499 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4500 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
4501 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
4502 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
4503 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
4504 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4505 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7]
4506 ; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm1
4507 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
4508 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
4509 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
4510 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
4511 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0
4512 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4513 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7]
4514 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
4515 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
4516 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4517 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
4518 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
4519 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
4520 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
4521 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
4522 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4523 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1]
4524 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
4525 ; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4526 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4527 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
4528 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7]
4529 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4530 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
4531 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
4532 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
4533 ; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0
4534 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
4535 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
4536 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13
4537 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7]
4538 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4
4539 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
4540 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm1
4541 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4542 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4543 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
4544 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
4545 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm3
4546 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4547 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
4548 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
4549 ; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14
4550 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4551 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
4552 ; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2
4553 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10
4554 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7]
4555 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15
4556 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4557 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4558 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
4559 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7]
4560 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
4561 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
4562 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
4563 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
4564 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4565 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9
4566 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
4567 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4568 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
4569 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
4570 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
4571 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
4572 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
4573 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4574 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8
4575 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7
4576 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2
4577 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7]
4578 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11
4579 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
4580 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
4581 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4582 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
4583 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4584 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
4585 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5
4586 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,2]
4587 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5]
4588 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4589 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
4590 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
4591 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7]
4592 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4593 ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6
4594 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4
4595 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
4596 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7]
4597 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
4598 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4599 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
4600 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4601 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
4602 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3
4603 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,0,2]
4604 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5]
4605 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4606 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
4607 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4608 ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4609 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4610 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7]
4611 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14
4612 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7]
4613 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
4614 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1
4615 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4616 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
4617 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
4618 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
4619 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4620 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4621 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4622 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4623 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7]
4624 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4625 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
4626 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0
4627 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4628 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
4629 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
4630 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
4631 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4632 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4633 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4634 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4635 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
4636 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4637 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
4638 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
4639 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
4640 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4641 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2]
4642 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
4643 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
4644 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
4645 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
4646 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
4647 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4648 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
4649 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4650 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
4651 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
4652 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4653 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,1,2]
4654 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
4655 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
4656 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
4657 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
4658 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
4659 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4660 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
4661 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4662 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
4663 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
4664 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
4665 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4666 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3]
4667 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4]
4668 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
4669 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
4670 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
4671 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4672 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4673 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
4674 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4675 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
4676 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
4677 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
4678 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
4679 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3]
4680 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
4681 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
4682 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
4683 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
4684 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
4685 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4686 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7]
4687 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
4688 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4689 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
4690 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
4691 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
4692 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
4693 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4694 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5]
4695 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4696 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
4697 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
4698 ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7]
4699 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5
4700 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
4701 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
4702 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
4703 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
4704 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
4705 ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
4706 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7]
4707 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
4708 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
4709 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15]
4710 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
4711 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15]
4712 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
4713 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
4714 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4715 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
4716 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4717 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
4718 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
4719 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
4720 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
4721 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
4722 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4723 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4724 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4725 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
4726 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm5
4727 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
4728 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
4729 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
4730 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
4731 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
4732 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
4733 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
4734 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4735 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
4736 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4737 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
4738 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
4739 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4740 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4741 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
4742 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4743 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
4744 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
4745 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4746 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
4747 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
4748 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
4749 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
4750 ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3
4751 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0
4752 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
4753 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
4754 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
4755 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
4756 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
4757 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7]
4758 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
4759 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
4760 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
4761 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
4762 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
4763 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
4764 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
4765 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1
4766 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
4767 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
4768 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4769 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4770 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
4771 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15
4772 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7
4773 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7]
4774 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
4775 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
4776 ; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11
4777 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1
4778 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
4779 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
4780 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6]
4781 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
4782 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15]
4783 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4784 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7]
4785 ; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13
4786 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1]
4787 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15]
4788 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm8
4789 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4790 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4791 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
4792 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14
4793 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
4794 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7]
4795 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
4796 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4797 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
4798 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15]
4799 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
4800 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
4801 ; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
4802 ; AVX2-SLOW-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7]
4803 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
4804 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15]
4805 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
4806 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6
4807 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
4808 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
4809 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
4810 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
4811 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
4812 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4813 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4814 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4815 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
4816 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
4817 ; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4818 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
4819 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
4820 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
4821 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
4822 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
4823 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
4824 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
4825 ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8
4826 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
4827 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
4828 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4829 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
4830 ; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
4831 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8
4832 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7]
4833 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
4834 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7]
4835 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4836 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4837 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4838 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4839 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15]
4840 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7]
4841 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
4842 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
4843 ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3
4844 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7]
4845 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
4846 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
4847 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
4848 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
4849 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
4850 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
4851 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
4852 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
4853 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4854 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi)
4855 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4856 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi)
4857 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4858 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx)
4859 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4860 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx)
4861 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4862 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx)
4863 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4864 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx)
4865 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4866 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8)
4867 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4868 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8)
4869 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4870 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9)
4871 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4872 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9)
4873 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4874 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4875 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax)
4876 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax)
4877 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4878 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax)
4879 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax)
4880 ; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208
4881 ; AVX2-SLOW-NEXT: vzeroupper
4882 ; AVX2-SLOW-NEXT: retq
;
4884 ; AVX2-FAST-LABEL: load_i16_stride7_vf32:
4885 ; AVX2-FAST: # %bb.0:
4886 ; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288
4887 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm7
4888 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8
4889 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5
4890 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6
4891 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3
4892 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
4893 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
4894 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2
4895 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
4896 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13
4897 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm14
4898 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4899 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
4900 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2
4901 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
4902 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11
4903 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12
4904 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
4905 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
4906 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
4907 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4
4908 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
4909 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15
4910 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
4911 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9
4912 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm10
4913 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
4914 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
4915 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
4916 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6
4917 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
4918 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
4919 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
4920 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
4921 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4922 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
4923 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,1,u,4,u,u,u]
4924 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
4925 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
4926 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
4927 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
4928 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
4929 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
4930 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
4931 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
4932 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1
4933 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4934 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7
4935 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
4936 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
4937 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
4938 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
4939 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
4940 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2
4941 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4942 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
4943 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4944 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
4945 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4946 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4947 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
4948 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4949 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4950 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
4951 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
4952 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,6,1,u,5,u,u,u]
4953 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1
4954 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
4955 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1
4956 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
4957 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
4958 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm9
4959 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
4960 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4961 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4962 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
4963 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
4964 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
4965 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4966 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5
4967 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4968 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
4969 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
4970 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
4971 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6
4972 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
4973 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
4974 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
4975 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
4976 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
4977 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
4978 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4979 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
4980 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
4981 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
4982 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4983 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
4984 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
4985 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
4986 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
4987 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
4988 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
4989 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
4990 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8
4991 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0
4992 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1
4993 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
4994 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5
4995 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7
4996 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5]
4997 ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
4998 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm2
4999 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31]
5000 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12
5001 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4
5002 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2]
5003 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
5004 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11
5005 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
5006 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
5007 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7]
5008 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5009 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2
5010 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm3
5011 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
5012 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12
5013 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11
5014 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm13, %ymm13
5015 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1
5016 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2
5017 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2]
5018 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3
5019 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm15
5020 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
5021 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5022 ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
5023 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5024 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10
5025 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
5026 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15
5027 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7]
5028 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
5029 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1
5030 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5031 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
5032 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5033 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5034 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5035 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
5036 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5037 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5038 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7]
5039 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5040 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
5041 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0
5042 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1
5043 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5044 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5045 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5046 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
5047 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5048 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5049 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
5050 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7
5051 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5052 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
5053 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
5054 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5055 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5056 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
5057 ; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
5058 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm14
5059 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
5060 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
5061 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
5062 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5063 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5064 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
5065 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14
5066 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9
5067 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7]
5068 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5069 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5070 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5071 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm1
5072 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
5073 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5074 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15]
5075 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
5076 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5077 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7]
5078 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5079 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
5080 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
5081 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5082 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5083 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,3]
5084 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
5085 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm9
5086 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
5087 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5088 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
5089 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5090 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5091 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7]
5092 ; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12
5093 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4
5094 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
5095 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5096 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5097 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,3]
5098 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm1
5099 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5100 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
5101 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
5102 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5103 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5104 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5105 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
5106 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
5107 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
5108 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
5109 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5110 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7]
5111 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5112 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5113 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8
5114 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7]
5115 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
5116 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5117 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
5118 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
5119 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5120 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5121 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7]
5122 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,2,6,u,u,u]
5123 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6
5124 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
5125 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6
5126 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
5127 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15]
5128 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5129 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5130 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5131 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5132 ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
5133 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
5134 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
5135 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
5136 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5137 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5138 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5139 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7]
5140 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2
5141 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
5142 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9
5143 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7]
5144 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
5145 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
5146 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5147 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
5148 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2
5149 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
5150 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
5151 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5152 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5153 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7]
5154 ; AVX2-FAST-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
5155 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
5156 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7]
5157 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4]
5158 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
5159 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
5160 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
5161 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15]
5162 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
5163 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15
5164 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
5165 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
5166 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5167 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
5168 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
5169 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9
5170 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7
5171 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,7,3,6,u,u,u]
5172 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15
5173 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
5174 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15
5175 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15]
5176 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3]
5177 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
5178 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5179 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5180 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7]
5181 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1
5182 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
5183 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm11
5184 ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12
5185 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7]
5186 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8
5187 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
5188 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15]
5189 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
5190 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2
5191 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
5192 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5193 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5194 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
5195 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8
5196 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
5197 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5198 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
5199 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
5200 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
5201 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5202 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5203 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5204 ; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5205 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
5206 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0]
5207 ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
5208 ; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload
5209 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
5210 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3
5211 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5]
5212 ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
5213 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0
5214 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
5215 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
5216 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
5217 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
5218 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5219 ; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
5220 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
5221 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
5222 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
5223 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9
5224 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
5225 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
5226 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,4,0,3,7,u,u,u]
5227 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3
5228 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
5229 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3
5230 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
5231 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
5232 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5233 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2
5234 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2
5235 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
5236 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3
5237 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3
5238 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
5239 ; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
5240 ; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7]
5241 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3
5242 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3
5243 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7]
5244 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
5245 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5
5246 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
5247 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
5248 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
5249 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5250 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5251 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5252 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi)
5253 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5254 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi)
5255 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5256 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx)
5257 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5258 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx)
5259 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5260 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx)
5261 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5262 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx)
5263 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5264 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8)
5265 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5266 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8)
5267 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5268 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9)
5269 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5270 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9)
5271 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
5272 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5273 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax)
5274 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5275 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax)
5276 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
5277 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax)
5278 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax)
5279 ; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288
5280 ; AVX2-FAST-NEXT: vzeroupper
5281 ; AVX2-FAST-NEXT: retq
5282 ;
5283 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf32:
5284 ; AVX2-FAST-PERLANE: # %bb.0:
5285 ; AVX2-FAST-PERLANE-NEXT: subq $552, %rsp # imm = 0x228
5286 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm11
5287 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10
5288 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5
5289 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12
5290 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3
5291 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4
5292 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6
5293 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1
5294 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
5295 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9
5296 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
5297 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
5298 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2
5299 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
5300 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7
5301 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8
5302 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3
5303 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
5304 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
5305 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3
5306 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
5307 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1
5308 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
5309 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm1
5310 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7]
5311 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5
5312 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
5313 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2
5314 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
5315 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
5316 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0
5317 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
5318 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5319 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
5320 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
5321 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
5322 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
5323 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5324 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7]
5325 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
5326 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
5327 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
5328 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4
5329 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0
5330 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5331 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
5332 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
5333 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
5334 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5335 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
5336 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
5337 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
5338 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2
5339 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
5340 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5341 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1]
5342 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
5343 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5344 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5345 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
5346 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7]
5347 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5348 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3
5349 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
5350 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
5351 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
5352 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0
5353 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
5354 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2
5355 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
5356 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5357 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7]
5358 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5359 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5360 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5361 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
5362 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
5363 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13
5364 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5365 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
5366 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1]
5367 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5368 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
5369 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5370 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7
5371 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
5372 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15
5373 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5374 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5375 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
5376 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
5377 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
5378 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
5379 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5380 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
5381 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
5382 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10
5383 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7]
5384 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5385 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
5386 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
5387 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
5388 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
5389 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5390 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8
5391 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6
5392 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2
5393 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7]
5394 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11
5395 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
5396 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7]
5397 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
5398 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
5399 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5400 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
5401 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm5
5402 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2]
5403 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
5404 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4
5405 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
5406 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
5407 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
5408 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
5409 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm9
5410 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm4
5411 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7]
5412 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm14
5413 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3
5414 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
5415 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
5416 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
5417 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm1
5418 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2]
5419 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm13
5420 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7]
5421 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
5422 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7]
5423 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5424 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7]
5425 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13
5426 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7]
5427 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
5428 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3
5429 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5430 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
5431 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm14
5432 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
5433 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5434 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15]
5435 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7]
5436 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5437 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
5438 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14
5439 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7]
5440 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3
5441 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5442 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2
5443 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5444 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5445 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
5446 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5447 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5448 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7]
5449 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5450 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
5451 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
5452 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5453 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5454 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2]
5455 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
5456 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3
5457 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
5458 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5459 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
5460 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5461 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5462 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7]
5463 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3
5464 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
5465 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5466 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2]
5467 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2
5468 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5469 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
5470 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
5471 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
5472 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5473 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7]
5474 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5475 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
5476 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
5477 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5478 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5479 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,1,3]
5480 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
5481 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm5
5482 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
5483 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
5484 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
5485 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5486 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7]
5487 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5
5488 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
5489 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5490 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
5491 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,1,3]
5492 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm1
5493 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5494 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
5495 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
5496 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5497 ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
5498 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
5499 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
5500 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3
5501 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14
5502 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
5503 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5504 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5505 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5506 ; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
5507 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7]
5508 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
5509 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm8
5510 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
5511 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15]
5512 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
5513 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7]
5514 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5515 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5516 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm10
5517 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
5518 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
5519 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3
5520 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5521 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
5522 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7
5523 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
5524 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15]
5525 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5526 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5527 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5528 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5529 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
5530 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm3
5531 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
5532 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5533 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5534 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5535 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5536 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7]
5537 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm5
5538 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
5539 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
5540 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
5541 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
5542 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm7
5543 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
5544 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3
5545 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0
5546 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5547 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
5548 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
5549 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5550 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5551 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7]
5552 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
5553 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15]
5554 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
5555 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3
5556 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7]
5557 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
5558 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0
5559 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
5560 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm3
5561 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0
5562 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
5563 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
5564 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10
5565 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5566 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
5567 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
5568 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14
5569 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
5570 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14
5571 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5572 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
5573 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
5574 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2
5575 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
5576 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3]
5577 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
5578 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5579 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7]
5580 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10
5581 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7]
5582 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm10
5583 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15
5584 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm12
5585 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7]
5586 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5
5587 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5588 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
5589 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15]
5590 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5591 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5592 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7]
5593 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
5594 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15]
5595 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm10, %ymm10
5596 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5597 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5598 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
5599 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14
5600 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14
5601 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5602 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
5603 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15]
5604 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
5605 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5606 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5607 ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
5608 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7]
5609 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1]
5610 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15]
5611 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
5612 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6
5613 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
5614 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
5615 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
5616 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
5617 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5618 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15]
5619 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
5620 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5621 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
5622 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5623 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5624 ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
5625 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm10
5626 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
5627 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10
5628 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
5629 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
5630 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
5631 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5
5632 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
5633 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
5634 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5635 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5636 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5637 ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
5638 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
5639 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
5640 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3
5641 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
5642 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4
5643 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5644 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5645 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15]
5646 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7]
5647 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
5648 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
5649 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4
5650 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7]
5651 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6
5652 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6
5653 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
5654 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
5655 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
5656 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5657 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5658 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5659 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi)
5660 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
5661 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi)
5662 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5663 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx)
5664 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5665 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx)
5666 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5667 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx)
5668 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5669 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx)
5670 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5671 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8)
5672 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5673 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8)
5674 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5675 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r9)
5676 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5677 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r9)
5678 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5679 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5680 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rax)
5681 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax)
5682 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5683 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax)
5684 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax)
5685 ; AVX2-FAST-PERLANE-NEXT: addq $552, %rsp # imm = 0x228
5686 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
5687 ; AVX2-FAST-PERLANE-NEXT: retq
5688 ;
5689 ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf32:
5690 ; AVX512F-ONLY-SLOW: # %bb.0:
5691 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2
5692 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3
5693 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12
5694 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11
5695 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
5696 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5697 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5698 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
5699 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm4
5700 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm8
5701 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5702 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
5703 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
5704 ; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21
5705 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
5706 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10
5707 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
5708 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
5709 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
5710 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
5711 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
5712 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
5713 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
5714 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
5715 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
5716 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5]
5717 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
5718 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
5719 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm3
5720 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6
5721 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,0,3]
5722 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, %xmm13
5723 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
5724 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
5725 ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
5726 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
5727 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm21 {%k1}
5728 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5
5729 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6
5730 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14
5731 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
5732 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
5733 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
5734 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5735 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
5736 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
5737 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7]
5738 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5739 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2
5740 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5741 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
5742 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
5743 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
5744 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
5745 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5746 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
5747 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
5748 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5749 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
5750 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
5751 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
5752 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22
5753 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
5754 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5755 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
5756 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5757 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6,7]
5758 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
5759 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
5760 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5761 ; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm19
5762 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
5763 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5764 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
5765 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
5766 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5767 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28
5768 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5769 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2]
5770 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
5771 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
5772 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
5773 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
5774 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
5775 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27
5776 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
5777 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm23
5778 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
5779 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
5780 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5781 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3,4],ymm8[5],ymm4[6,7]
5782 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm25
5783 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26
5784 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
5785 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
5786 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5787 ; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm20
5788 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm0[0,1,1,3]
5789 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
5790 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
5791 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
5792 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
5793 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5794 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[0,1,2,1,4,5,6,5]
5795 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
5796 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5797 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1
5798 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm14, %xmm2
5799 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
5800 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18
5801 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
5802 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
5803 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
5804 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
5805 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5806 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2
5807 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1
5808 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
5809 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
5810 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
5811 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5812 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
5813 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm13, %xmm3
5814 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm13, %xmm31
5815 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
5816 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16
5817 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm17
5818 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
5819 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
5820 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[3],xmm3[4],xmm0[5],xmm3[6,7]
5821 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15
5822 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4
5823 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm15[3],ymm4[4,5],ymm15[6],ymm4[7]
5824 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
5825 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6],ymm8[7,8,9,10,11,12,13],ymm13[14],ymm8[15]
5826 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,1]
5827 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
5828 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
5829 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4,5,6],xmm8[7]
5830 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
5831 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8
5832 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm13
5833 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
5834 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7
5835 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
5836 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5837 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
5838 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
5839 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5840 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
5841 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29
5842 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
5843 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
5844 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
5845 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7]
5846 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
5847 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6],ymm7[7,8],ymm3[9,10,11,12,13,14],ymm7[15]
5848 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
5849 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
5850 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7]
5851 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
5852 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7]
5853 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
5854 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
5855 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
5856 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
5857 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
5858 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
5859 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5860 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
5861 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30
5862 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3,4],ymm4[5],ymm15[6,7]
5863 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
5864 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
5865 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6,7,8],ymm3[9],ymm0[10,11,12,13,14,15]
5866 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
5867 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
5868 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6],xmm3[7]
5869 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm7
5870 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3
5871 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
5872 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7]
5873 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7
5874 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
5875 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
5876 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7]
5877 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
5878 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5879 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
5880 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28
5881 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7]
5882 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
5883 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
5884 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
5885 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
5886 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5887 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
5888 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
5889 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
5890 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3
5891 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
5892 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
5893 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm27
5894 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
5895 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
5896 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
5897 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm13[2],ymm8[3,4,5],ymm13[6],ymm8[7]
5898 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14
5899 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7]
5900 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
5901 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7]
5902 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,2,0]
5903 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5904 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0
5905 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
5906 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
5907 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5908 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15]
5909 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
5910 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24
5911 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
5912 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9
5913 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7]
5914 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7]
5915 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
5916 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5917 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
5918 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
5919 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5920 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
5921 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1
5922 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2
5923 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
5924 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
5925 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
5926 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16
5927 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
5928 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5929 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
5930 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
5931 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3
5932 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7
5933 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
5934 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5935 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5936 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
5937 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
5938 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
5939 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
5940 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7]
5941 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
5942 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
5943 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
5944 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,0,1]
5945 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3],ymm2[4,5,6,7,8,9,10],ymm10[11],ymm2[12,13,14,15]
5946 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
5947 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm14
5948 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3,4,5],xmm14[6],xmm10[7]
5949 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
5950 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5951 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5952 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
5953 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
5954 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm10, %ymm2
5955 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5956 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
5957 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
5958 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
5959 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
5960 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm2[2,3,0,1]
5961 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7,8,9,10,11],ymm14[12],ymm2[13,14,15]
5962 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
5963 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9
5964 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2],xmm14[3],xmm9[4,5,6,7]
5965 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
5966 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5967 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5968 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
5969 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
5970 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm9, %ymm2
5971 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5972 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
5973 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
5974 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6],ymm9[7,8],ymm2[9,10,11,12,13,14],ymm9[15]
5975 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
5976 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm14
5977 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
5978 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7]
5979 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
5980 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
5981 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
5982 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
5983 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7]
5984 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
5985 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
5986 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6,7,8],ymm11[9],ymm9[10,11,12,13,14,15]
5987 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm11
5988 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm12
5989 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm14
5990 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm25
5991 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
5992 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26
5993 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7]
5994 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,4,6,7]
5995 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3
5996 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
5997 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
5998 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
5999 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
6000 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3,4,5,6,7]
6001 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7]
6002 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8
6003 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2],xmm0[3],xmm8[4,5,6,7]
6004 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
6005 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
6006 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
6007 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6008 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7]
6009 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
6010 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7]
6011 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,3,1]
6012 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
6013 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7]
6014 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
6015 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6016 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
6017 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
6018 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6019 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
6020 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm4, %zmm11
6021 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
6022 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22
6023 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm4, %zmm12
6024 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6025 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm4, %zmm23
6026 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
6027 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
6028 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm23 {%k1}
6029 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm4, %zmm27
6030 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1}
6031 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm4, %zmm1
6032 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1}
6033 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rsi)
6034 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx)
6035 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx)
6036 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%r8)
6037 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9)
6038 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6039 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm4, %zmm2
6040 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1}
6041 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax)
6042 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6043 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm4, %zmm3
6044 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
6045 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax)
6046 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper
6047 ; AVX512F-ONLY-SLOW-NEXT: retq
;
6049 ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf32:
6050 ; AVX512F-ONLY-FAST: # %bb.0:
6051 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm30
6052 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31
6053 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
6054 ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
6055 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25
6056 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
6057 ; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
6058 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,3,6,15,12,13,6,15]
6059 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
6060 ; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
6061 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14
6062 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,6,9,u,13,u,u,u]
6063 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm10
6064 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm1, %zmm12
6065 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,u,u,u,4,8,11,15]
6066 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm15
6067 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,5,9,u,12,u,u,u]
6068 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm3
6069 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm16, %zmm7
6070 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,u,u,4,7,11,14]
6071 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm2
6072 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,12,5,12,5,14,15]
6073 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm0
6074 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm19, %zmm4
6075 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm28
6076 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm28[0,1,0,2]
6077 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
6078 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm5
6079 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm23
6080 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
6081 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm5[7]
6082 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6083 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm4
6084 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
6085 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
6086 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
6087 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7]
6088 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u]
6089 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm8, %ymm22
6090 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm0
6091 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm13
6092 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
6093 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm0[2],xmm8[3],xmm0[3]
6094 ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0
6095 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
6096 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm22 {%k1}
6097 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm6
6098 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm8
6099 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
6100 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9
6101 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7]
6102 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
6103 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
6104 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
6105 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6106 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
6107 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7
6108 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
6109 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24
6110 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
6111 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7
6112 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3,4,5],xmm0[6],xmm7[7]
6113 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6114 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
6115 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6116 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm0, %ymm20
6117 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm3
6118 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9
6119 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7]
6120 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7
6121 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
6122 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm7
6123 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
6124 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6125 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
6126 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
6127 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm13[1],xmm7[2,3,4,5,6,7]
6128 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1
6129 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27
6130 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
6131 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6132 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
6133 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
6134 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
6135 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7]
6136 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6137 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
6138 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6139 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26
6140 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6141 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
6142 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10
6143 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3,4,5],xmm10[6],xmm1[7]
6144 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6145 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21
6146 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7]
6147 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6148 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
6149 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
6150 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
6151 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6152 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5]
6153 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm10, %ymm10
6154 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6155 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm10[7]
6156 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
6157 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
6158 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm15
6159 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2],ymm6[3,4,5],ymm8[6],ymm6[7]
6160 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11
6161 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7]
6162 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm18, %zmm11
6163 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,u,u,u,5,8,12,15]
6164 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
6165 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm12, %zmm1
6166 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
6167 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0
6168 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm19, %zmm1
6169 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
6170 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6171 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17
6172 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6173 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
6174 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
6175 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
6176 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6177 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm18
6178 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7]
6179 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6180 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
6181 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
6182 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6183 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm28[0,1,1,3]
6184 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
6185 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
6186 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
6187 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
6188 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm14
6189 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm2
6190 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
6191 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6192 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
6193 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7]
6194 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10
6195 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7]
6196 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9]
6197 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
6198 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6199 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm10
6200 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6201 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
6202 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
6203 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1
6204 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6205 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,7,10,14,u,u,u]
6206 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7]
6207 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
6208 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
6209 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm0
6210 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
6211 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6212 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm3
6213 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3
6214 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
6215 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3
6216 ; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm7, %xmm9
6217 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
6218 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
6219 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm23
6220 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
6221 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm1
6222 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3
6223 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
6224 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
6225 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
6226 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6227 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
6228 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm16, %zmm1
6229 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6230 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm23
6231 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,u,0,3,7,u]
6232 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm0, %ymm0
6233 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
6234 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
6235 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
6236 ; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm13, %xmm1
6237 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
6238 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm29
6239 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7]
6240 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6241 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
6242 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5,6,7]
6243 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
6244 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
6245 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
6246 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6247 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,u,u,6,9,13,u]
6248 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm3
6249 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6250 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
6251 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
6252 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
6253 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6254 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
6255 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6256 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
6257 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm26
6258 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,9,13,2,6,9,13]
6259 ; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1]
6260 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm3, %zmm3
6261 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,4,7,11,14,u,u,u]
6262 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm17
6263 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm12, %zmm12
6264 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
6265 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6266 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
6267 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm1[2,3,4,5,6,7]
6268 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6,7]
6269 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6270 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
6271 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
6272 ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
6273 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm1, %ymm1
6274 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
6275 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
6276 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15]
6277 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
6278 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
6279 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm7
6280 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
6281 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
6282 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
6283 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,u,u,u,6,10,13,u]
6284 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
6285 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm3
6286 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
6287 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6288 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6289 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
6290 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
6291 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
6292 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6293 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,4,8,11,15,u,u,u]
6294 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm3
6295 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm1, %zmm1
6296 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1
6297 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
6298 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
6299 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
6300 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
6301 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
6302 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7]
6303 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7]
6304 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
6305 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7]
6306 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
6307 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
6308 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
6309 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
6310 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
6311 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,10,3,14,7,10,3]
6312 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6313 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm5
6314 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
6315 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4,5,6,7]
6316 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
6317 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6318 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
6319 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
6320 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
6321 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
6322 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm4, %zmm24
6323 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm27
6324 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm27, %zmm4, %zmm26
6325 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm21, %zmm16, %zmm15
6326 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
6327 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
6328 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1}
6329 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm18, %zmm16, %zmm19
6330 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm19 {%k1}
6331 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm23 {%k1}
6332 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm29, %zmm16, %zmm12
6333 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
6334 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rsi)
6335 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rdx)
6336 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%rcx)
6337 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%r8)
6338 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%r9)
6339 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
6340 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rax)
6341 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm1
6342 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
6343 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
6344 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
6345 ; AVX512F-ONLY-FAST-NEXT: vzeroupper
6346 ; AVX512F-ONLY-FAST-NEXT: retq
;
6348 ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf32:
6349 ; AVX512DQ-SLOW: # %bb.0:
6350 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2
6351 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3
6352 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1
6353 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10
6354 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7]
6355 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm4
6356 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
6357 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6358 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
6359 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm12
6360 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11
6361 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
6362 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
6363 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
6364 ; AVX512DQ-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm18
6365 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8
6366 ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9
6367 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
6368 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
6369 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
6370 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
6371 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
6372 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
6373 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
6374 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
6375 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
6376 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5]
6377 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
6378 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6379 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm3
6380 ; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6
6381 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,0,3]
6382 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, %xmm13
6383 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
6384 ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6385 ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
6386 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
6387 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm18 {%k1}
6388 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6
6389 ; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm5
6390 ; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14
6391 ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
6392 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7]
6393 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
6394 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6395 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
6396 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
6397 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7]
6398 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6399 ; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2
6400 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6401 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7]
6402 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6403 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
6404 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
6405 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6406 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
6407 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
6408 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6409 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
6410 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6411 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
6412 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm21
6413 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7]
6414 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
6415 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
6416 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6417 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7]
6418 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6419 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
6420 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6421 ; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm19
6422 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
6423 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
6424 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
6425 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
6426 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
6427 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25
6428 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6429 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2]
6430 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
6431 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6432 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6433 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
6434 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
6435 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26
6436 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
6437 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22
6438 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
6439 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
6440 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6441 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7]
6442 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm15
6443 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6444 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
6445 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6446 ; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm17
6447 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3]
6448 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
6449 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6450 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
6451 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
6452 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6453 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5]
6454 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
6455 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
6456 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1
6457 ; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm14, %xmm2
6458 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6459 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20
6460 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
6461 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6462 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
6463 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
6464 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6465 ; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2
6466 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7
6467 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5],xmm2[6],xmm7[7]
6468 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29
6469 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
6470 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
6471 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6472 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6473 ; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm13, %xmm1
6474 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm27
6475 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
6476 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm14, %xmm28
6477 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24
6478 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm12
6479 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
6480 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6481 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
6482 ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1
6483 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14
6484 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7]
6485 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6486 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
6487 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
6488 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
6489 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
6490 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
6491 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6492 ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5
6493 ; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm13
6494 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7]
6495 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6496 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
6497 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
6498 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
6499 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6500 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6501 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6502 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30
6503 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7]
6504 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6505 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
6506 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
6507 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6508 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15]
6509 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
6510 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
6511 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
6512 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6513 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
6514 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6515 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
6516 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
6517 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
6518 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
6519 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6520 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6521 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6522 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31
6523 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
6524 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6525 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
6526 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
6527 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7]
6528 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6529 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
6530 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3
6531 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
6532 ; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6533 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
6534 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
6535 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
6536 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
6537 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
6538 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
6539 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6540 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
6541 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6542 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm22
6543 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
6544 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
6545 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1}
6546 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
6547 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6548 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
6549 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
6550 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
6551 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6552 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4]
6553 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
6554 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
6555 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2
6556 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
6557 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
6558 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19
6559 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7]
6560 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6561 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
6562 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7]
6563 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6564 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
6565 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
6566 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7]
6567 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0]
6568 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6569 ; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0
6570 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
6571 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
6572 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6573 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
6574 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6575 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm19
6576 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1}
6577 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7]
6578 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6579 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
6580 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
6581 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23
6582 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
6583 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26
6584 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6585 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
6586 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
6587 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
6588 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6589 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3,4,5,6,7]
6590 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7]
6591 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6592 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
6593 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
6594 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1]
6595 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15]
6596 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm0
6597 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7]
6598 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12
6599 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3,4,5],xmm12[6],xmm15[7]
6600 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
6601 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6602 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6603 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
6604 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
6605 ; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm12, %ymm3
6606 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6607 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11
6608 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1}
6609 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
6610 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6611 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
6612 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
6613 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1]
6614 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7,8,9,10,11],ymm12[12],ymm3[13,14,15]
6615 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7]
6616 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20
6617 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15
6618 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7]
6619 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
6620 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6621 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6622 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
6623 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
6624 ; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm12, %ymm3
6625 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6626 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17
6627 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7]
6628 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3
6629 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1]
6630 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1,2,3,4,5,6],ymm12[7,8],ymm2[9,10,11,12,13,14],ymm12[15]
6631 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
6632 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
6633 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
6634 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8
6635 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm9
6636 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0
6637 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7]
6638 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
6639 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6640 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
6641 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
6642 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6643 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7],ymm12[8,9,10,11,12],ymm7[13,14,15]
6644 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0
6645 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4
6646 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
6647 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
6648 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
6649 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm7, %zmm27
6650 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7
6651 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0
6652 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7]
6653 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15
6654 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
6655 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
6656 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
6657 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
6658 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
6659 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
6660 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3,4,5,6,7]
6661 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm24, %zmm25, %zmm2
6662 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm2 {%k1}
6663 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7]
6664 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
6665 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
6666 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7]
6667 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,4,6,7]
6668 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
6669 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
6670 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
6671 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
6672 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
6673 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
6674 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
6675 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10
6676 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7]
6677 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0
6678 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
6679 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
6680 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
6681 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6682 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7]
6683 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
6684 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7]
6685 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1]
6686 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
6687 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7]
6688 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
6689 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6690 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
6691 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
6692 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm27, %zmm25, %zmm3
6693 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
6694 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
6695 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm8
6696 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6697 ; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21
6698 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm9
6699 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rsi)
6700 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx)
6701 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%rcx)
6702 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%r8)
6703 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%r9)
6704 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6705 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax)
6706 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6707 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax)
6708 ; AVX512DQ-SLOW-NEXT: vzeroupper
6709 ; AVX512DQ-SLOW-NEXT: retq
6710 ;
6711 ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf32:
6712 ; AVX512DQ-FAST: # %bb.0:
6713 ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26
6714 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1
6715 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
6716 ; AVX512DQ-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3]
6717 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm23
6718 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
6719 ; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
6720 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
6721 ; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
6722 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm20
6723 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,6,9,u,13,u,u,u]
6724 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm7
6725 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,u,u,u,4,8,11,15]
6726 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,5,9,u,12,u,u,u]
6727 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm10
6728 ; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm18, %zmm0
6729 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,u,u,u,4,7,11,14]
6730 ; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm3, %zmm9
6731 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
6732 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm3, %zmm3
6733 ; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm19, %zmm4
6734 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25
6735 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,0,2]
6736 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
6737 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm5
6738 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm30
6739 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
6740 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7]
6741 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6742 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3
6743 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
6744 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
6745 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
6746 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
6747 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
6748 ; AVX512DQ-FAST-NEXT: vporq %ymm6, %ymm11, %ymm22
6749 ; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm6
6750 ; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm13
6751 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
6752 ; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
6753 ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0
6754 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
6755 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm22 {%k1}
6756 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm5
6757 ; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm6
6758 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
6759 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
6760 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3],xmm12[4],xmm11[5],xmm12[6,7]
6761 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
6762 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
6763 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm11[3,4,5,6],xmm9[7]
6764 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm9[4,5,6,7]
6765 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
6766 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6767 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
6768 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm28
6769 ; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm15
6770 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
6771 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11
6772 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
6773 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6774 ; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0
6775 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6776 ; AVX512DQ-FAST-NEXT: vporq %ymm10, %ymm0, %ymm21
6777 ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm12
6778 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm10
6779 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7]
6780 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14
6781 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7]
6782 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
6783 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6784 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
6785 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7]
6786 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7]
6787 ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8
6788 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24
6789 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6790 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
6791 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
6792 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
6793 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6794 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm7, %ymm8
6795 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7]
6796 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7
6797 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2,3,4,5],xmm0[6],xmm7[7]
6798 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
6799 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7
6800 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6801 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5]
6802 ; AVX512DQ-FAST-NEXT: vpermd %ymm25, %ymm11, %ymm11
6803 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6804 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5,6],ymm11[7]
6805 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
6806 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
6807 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm2
6808 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm31
6809 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm16
6810 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
6811 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm11
6812 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm2[4],xmm11[5],xmm2[6],xmm11[7]
6813 ; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm27, %zmm11
6814 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
6815 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
6816 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3,4,5,6],xmm11[7]
6817 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
6818 ; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm17, %zmm11
6819 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
6820 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
6821 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm29
6822 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
6823 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm14
6824 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7]
6825 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15]
6826 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,u,u,u,5,8,12,15]
6827 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
6828 ; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm27, %zmm2
6829 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
6830 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0
6831 ; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm19, %zmm2
6832 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2
6833 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm14, %zmm9
6834 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6835 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm19, %zmm16
6836 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6837 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
6838 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
6839 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm16 {%k1}
6840 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6841 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
6842 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8
6843 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7]
6844 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6845 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm8
6846 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7]
6847 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
6848 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
6849 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
6850 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
6851 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3]
6852 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
6853 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7]
6854 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
6855 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm20
6856 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm2
6857 ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm9
6858 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4,5],ymm2[6],ymm9[7]
6859 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm14
6860 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7]
6861 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
6862 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm11
6863 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3,4,5],xmm14[6],xmm11[7]
6864 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9]
6865 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
6866 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6867 ; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm14, %zmm14
6868 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6869 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7],ymm14[8,9,10],ymm7[11,12,13,14,15]
6870 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
6871 ; AVX512DQ-FAST-NEXT: vpor %ymm14, %ymm11, %ymm11
6872 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
6873 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm19, %zmm20
6874 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm20 {%k1}
6875 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
6876 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
6877 ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm11
6878 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8
6879 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
6880 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
6881 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,3,7,10,14,u,u,u]
6882 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm8, %zmm14
6883 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
6884 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm14
6885 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
6886 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
6887 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12
6888 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
6889 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm12
6890 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0
6891 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
6892 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6893 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7]
6894 ; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm10
6895 ; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm15, %xmm12
6896 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
6897 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
6898 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm19, %zmm10
6899 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7]
6900 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11
6901 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
6902 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
6903 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
6904 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
6905 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,u,u,6,9,13,u]
6906 ; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm12, %zmm12
6907 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
6908 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6909 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6910 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
6911 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
6912 ; AVX512DQ-FAST-NEXT: vpor %ymm12, %ymm11, %ymm11
6913 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7]
6914 ; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm17, %zmm0
6915 ; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm18, %zmm12
6916 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm11, %zmm0, %zmm10 {%k1}
6917 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,u,0,3,7,u]
6918 ; AVX512DQ-FAST-NEXT: vpermd %ymm25, %ymm11, %ymm11
6919 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
6920 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
6921 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15]
6922 ; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm13, %xmm12
6923 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
6924 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm11, %zmm11
6925 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
6926 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
6927 ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm7
6928 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,4,7,11,14,u,u,u]
6929 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm14, %zmm14
6930 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
6931 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
6932 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
6933 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm12[2,3,4,5,6,7]
6934 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6,7]
6935 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm14
6936 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3,4,5],xmm14[6],xmm7[7]
6937 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,4,7,0,0,4,7,0]
6938 ; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1]
6939 ; AVX512DQ-FAST-NEXT: vpermd %ymm25, %ymm14, %ymm14
6940 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
6941 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
6942 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15]
6943 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
6944 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm14
6945 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm13
6946 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13
6947 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
6948 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14
6949 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2],xmm0[3],xmm14[4,5,6,7]
6950 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [3,u,u,u,6,10,13,u]
6951 ; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm14, %zmm14
6952 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
6953 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6954 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6955 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7],ymm14[8,9,10],ymm7[11,12,13,14,15]
6956 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
6957 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0
6958 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
6959 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm11, %zmm19, %zmm12
6960 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
6961 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,4,8,11,15,u,u,u]
6962 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm7
6963 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0
6964 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0
6965 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
6966 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
6967 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
6968 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
6969 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
6970 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
6971 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
6972 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
6973 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
6974 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
6975 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
6976 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
6977 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
6978 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6979 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3]
6980 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm4
6981 ; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm3, %zmm3
6982 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
6983 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4,5,6,7]
6984 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
6985 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6986 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
6987 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6988 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm13, %zmm19, %zmm0
6989 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
6990 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
6991 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm7
6992 ; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24
6993 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm4
6994 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rsi)
6995 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rdx)
6996 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rcx)
6997 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%r8)
6998 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%r9)
6999 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
7000 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax)
7001 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
7002 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
7003 ; AVX512DQ-FAST-NEXT: vzeroupper
7004 ; AVX512DQ-FAST-NEXT: retq
7005 ;
7006 ; AVX512BW-LABEL: load_i16_stride7_vf32:
7007 ; AVX512BW: # %bb.0:
7008 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7009 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7010 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3
7011 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6
7012 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7
7013 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7014 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
7015 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
7016 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
7017 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
7018 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7019 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8
7020 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u]
7021 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
7022 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
7023 ; AVX512BW-NEXT: kmovd %edi, %k1
7024 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
7025 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
7026 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7027 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7028 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
7029 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7030 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
7031 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
7032 ; AVX512BW-NEXT: kmovd %edi, %k1
7033 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1}
7034 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
7035 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7036 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7037 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
7038 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7039 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
7040 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u]
7041 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
7042 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
7043 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7044 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8
7045 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
7046 ; AVX512BW-NEXT: kmovd %edi, %k2
7047 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
7048 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1}
7049 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
7050 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7051 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
7052 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
7053 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7054 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm9, %zmm10
7055 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
7056 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7057 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11
7058 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u]
7059 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
7060 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
7061 ; AVX512BW-NEXT: kmovd %edi, %k1
7062 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k1}
7063 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00
7064 ; AVX512BW-NEXT: kmovd %edi, %k2
7065 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
7066 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
7067 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7068 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7069 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
7070 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7071 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
7072 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
7073 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7074 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
7075 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u]
7076 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
7077 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1}
7078 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
7079 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
7080 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7081 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7082 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
7083 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7084 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
7085 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
7086 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7087 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
7088 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u]
7089 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13
7090 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1}
7091 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
7092 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
7093 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7094 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7095 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
7096 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7097 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
7098 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
7099 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7100 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10
7101 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u]
7102 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14
7103 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1}
7104 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2}
7105 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
7106 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7107 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10
7108 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
7109 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7110 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm6
7111 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
7112 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7113 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3
7114 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u]
7115 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4
7116 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
7117 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2}
7118 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi)
7119 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx)
7120 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx)
7121 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8)
7122 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9)
7123 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10)
7124 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax)
7125 ; AVX512BW-NEXT: vzeroupper
7126 ; AVX512BW-NEXT: retq
7127 %wide.vec = load <224 x i16>, ptr %in.vec, align 64
7128 %strided.vec0 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
7129 %strided.vec1 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
7130 %strided.vec2 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
7131 %strided.vec3 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
7132 %strided.vec4 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
7133 %strided.vec5 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
7134 %strided.vec6 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
7135 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
7136 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
7137 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
7138 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
7139 store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
7140 store <32 x i16> %strided.vec5, ptr %out.vec5, align 64
7141 store <32 x i16> %strided.vec6, ptr %out.vec6, align 64
7142 ret void
7143 }
7145 define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
7146 ; SSE-LABEL: load_i16_stride7_vf64:
7147 ; SSE:       # %bb.0:
7148 ; SSE-NEXT: subq $1352, %rsp # imm = 0x548
7149 ; SSE-NEXT: movdqa 640(%rdi), %xmm9
7150 ; SSE-NEXT: movdqa 624(%rdi), %xmm12
7151 ; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill
7152 ; SSE-NEXT: movdqa 112(%rdi), %xmm8
7153 ; SSE-NEXT: movdqa 128(%rdi), %xmm10
7154 ; SSE-NEXT: movaps 160(%rdi), %xmm6
7155 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7156 ; SSE-NEXT: movaps 144(%rdi), %xmm13
7157 ; SSE-NEXT: movdqa 192(%rdi), %xmm2
7158 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7159 ; SSE-NEXT: movdqa 176(%rdi), %xmm4
7160 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7161 ; SSE-NEXT: movdqa 208(%rdi), %xmm11
7162 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
7163 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7164 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
7165 ; SSE-NEXT: movdqa %xmm3, %xmm1
7166 ; SSE-NEXT: pandn %xmm0, %xmm1
7167 ; SSE-NEXT: movdqa %xmm4, %xmm0
7168 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7169 ; SSE-NEXT: pand %xmm3, %xmm0
7170 ; SSE-NEXT: por %xmm1, %xmm0
7171 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0]
7172 ; SSE-NEXT: movdqa %xmm14, %xmm1
7173 ; SSE-NEXT: pandn %xmm0, %xmm1
7174 ; SSE-NEXT: movaps %xmm13, %xmm0
7175 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7176 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2]
7177 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
7178 ; SSE-NEXT: movaps %xmm6, %xmm2
7179 ; SSE-NEXT: andnps %xmm0, %xmm2
7180 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
7181 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7182 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,3]
7183 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7184 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
7185 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7186 ; SSE-NEXT: movdqa 656(%rdi), %xmm0
7187 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7188 ; SSE-NEXT: pand %xmm6, %xmm4
7189 ; SSE-NEXT: por %xmm2, %xmm4
7190 ; SSE-NEXT: pand %xmm14, %xmm4
7191 ; SSE-NEXT: por %xmm1, %xmm4
7192 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7193 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7194 ; SSE-NEXT: movdqa %xmm3, %xmm1
7195 ; SSE-NEXT: pandn %xmm0, %xmm1
7196 ; SSE-NEXT: movdqa %xmm12, %xmm0
7197 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
7198 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7199 ; SSE-NEXT: pand %xmm3, %xmm0
7200 ; SSE-NEXT: por %xmm1, %xmm0
7201 ; SSE-NEXT: movdqa %xmm14, %xmm1
7202 ; SSE-NEXT: pandn %xmm0, %xmm1
7203 ; SSE-NEXT: movaps 608(%rdi), %xmm2
7204 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7205 ; SSE-NEXT: movaps 592(%rdi), %xmm0
7206 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7207 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7208 ; SSE-NEXT: movaps %xmm6, %xmm2
7209 ; SSE-NEXT: andnps %xmm0, %xmm2
7210 ; SSE-NEXT: movdqa 560(%rdi), %xmm15
7211 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3]
7212 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7213 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7214 ; SSE-NEXT: movdqa 576(%rdi), %xmm5
7215 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
7216 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7217 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7218 ; SSE-NEXT: pand %xmm6, %xmm4
7219 ; SSE-NEXT: por %xmm2, %xmm4
7220 ; SSE-NEXT: pand %xmm14, %xmm4
7221 ; SSE-NEXT: por %xmm1, %xmm4
7222 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7223 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
7224 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7225 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7226 ; SSE-NEXT: movdqa %xmm3, %xmm1
7227 ; SSE-NEXT: pandn %xmm0, %xmm1
7228 ; SSE-NEXT: movdqa 80(%rdi), %xmm2
7229 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7230 ; SSE-NEXT: movdqa 64(%rdi), %xmm0
7231 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7232 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7233 ; SSE-NEXT: pand %xmm3, %xmm0
7234 ; SSE-NEXT: por %xmm1, %xmm0
7235 ; SSE-NEXT: movdqa %xmm14, %xmm1
7236 ; SSE-NEXT: pandn %xmm0, %xmm1
7237 ; SSE-NEXT: movaps 32(%rdi), %xmm0
7238 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7239 ; SSE-NEXT: movaps 48(%rdi), %xmm4
7240 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7241 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
7242 ; SSE-NEXT: movaps %xmm6, %xmm2
7243 ; SSE-NEXT: andnps %xmm0, %xmm2
7244 ; SSE-NEXT: movdqa (%rdi), %xmm0
7245 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7246 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7247 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7248 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
7249 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7250 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7251 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7252 ; SSE-NEXT: pand %xmm6, %xmm4
7253 ; SSE-NEXT: por %xmm2, %xmm4
7254 ; SSE-NEXT: pand %xmm14, %xmm4
7255 ; SSE-NEXT: por %xmm1, %xmm4
7256 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7257 ; SSE-NEXT: movdqa 544(%rdi), %xmm0
7258 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7259 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7260 ; SSE-NEXT: movdqa %xmm3, %xmm1
7261 ; SSE-NEXT: pandn %xmm0, %xmm1
7262 ; SSE-NEXT: movdqa 528(%rdi), %xmm2
7263 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7264 ; SSE-NEXT: movdqa 512(%rdi), %xmm0
7265 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7266 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7267 ; SSE-NEXT: pand %xmm3, %xmm0
7268 ; SSE-NEXT: por %xmm1, %xmm0
7269 ; SSE-NEXT: movdqa %xmm14, %xmm1
7270 ; SSE-NEXT: pandn %xmm0, %xmm1
7271 ; SSE-NEXT: movaps 496(%rdi), %xmm2
7272 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7273 ; SSE-NEXT: movaps 480(%rdi), %xmm0
7274 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7275 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7276 ; SSE-NEXT: movaps %xmm6, %xmm2
7277 ; SSE-NEXT: andnps %xmm0, %xmm2
7278 ; SSE-NEXT: movdqa 448(%rdi), %xmm0
7279 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7280 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7281 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7282 ; SSE-NEXT: movdqa 464(%rdi), %xmm12
7283 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
7284 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7285 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7286 ; SSE-NEXT: pand %xmm6, %xmm4
7287 ; SSE-NEXT: por %xmm2, %xmm4
7288 ; SSE-NEXT: pand %xmm14, %xmm4
7289 ; SSE-NEXT: por %xmm1, %xmm4
7290 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7291 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
7292 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7293 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7294 ; SSE-NEXT: movdqa %xmm3, %xmm1
7295 ; SSE-NEXT: pandn %xmm0, %xmm1
7296 ; SSE-NEXT: movdqa 416(%rdi), %xmm2
7297 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7298 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
7299 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7300 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7301 ; SSE-NEXT: pand %xmm3, %xmm0
7302 ; SSE-NEXT: por %xmm1, %xmm0
7303 ; SSE-NEXT: movdqa %xmm14, %xmm1
7304 ; SSE-NEXT: pandn %xmm0, %xmm1
7305 ; SSE-NEXT: movaps 384(%rdi), %xmm2
7306 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7307 ; SSE-NEXT: movaps 368(%rdi), %xmm0
7308 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7309 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7310 ; SSE-NEXT: movaps %xmm6, %xmm2
7311 ; SSE-NEXT: andnps %xmm0, %xmm2
7312 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
7313 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7314 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7315 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7316 ; SSE-NEXT: movdqa 352(%rdi), %xmm0
7317 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7318 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7319 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7320 ; SSE-NEXT: pand %xmm6, %xmm4
7321 ; SSE-NEXT: por %xmm2, %xmm4
7322 ; SSE-NEXT: pand %xmm14, %xmm4
7323 ; SSE-NEXT: por %xmm1, %xmm4
7324 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7325 ; SSE-NEXT: movdqa 880(%rdi), %xmm0
7326 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7327 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7328 ; SSE-NEXT: movdqa %xmm3, %xmm1
7329 ; SSE-NEXT: pandn %xmm0, %xmm1
7330 ; SSE-NEXT: movdqa 864(%rdi), %xmm4
7331 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7332 ; SSE-NEXT: movdqa 848(%rdi), %xmm0
7333 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7334 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
7335 ; SSE-NEXT: pand %xmm3, %xmm0
7336 ; SSE-NEXT: por %xmm1, %xmm0
7337 ; SSE-NEXT: movdqa %xmm14, %xmm1
7338 ; SSE-NEXT: pandn %xmm0, %xmm1
7339 ; SSE-NEXT: movaps 832(%rdi), %xmm2
7340 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7341 ; SSE-NEXT: movaps 816(%rdi), %xmm0
7342 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7343 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7344 ; SSE-NEXT: movaps %xmm6, %xmm2
7345 ; SSE-NEXT: andnps %xmm0, %xmm2
7346 ; SSE-NEXT: movdqa 784(%rdi), %xmm0
7347 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7348 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7349 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7350 ; SSE-NEXT: movdqa 800(%rdi), %xmm0
7351 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7352 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7353 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7354 ; SSE-NEXT: pand %xmm6, %xmm4
7355 ; SSE-NEXT: por %xmm2, %xmm4
7356 ; SSE-NEXT: pand %xmm14, %xmm4
7357 ; SSE-NEXT: por %xmm1, %xmm4
7358 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7359 ; SSE-NEXT: movdqa 320(%rdi), %xmm0
7360 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7361 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7362 ; SSE-NEXT: movdqa %xmm3, %xmm1
7363 ; SSE-NEXT: pandn %xmm0, %xmm1
7364 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
7365 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7366 ; SSE-NEXT: movdqa 288(%rdi), %xmm0
7367 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7368 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7369 ; SSE-NEXT: pand %xmm3, %xmm0
7370 ; SSE-NEXT: por %xmm1, %xmm0
7371 ; SSE-NEXT: movdqa %xmm14, %xmm1
7372 ; SSE-NEXT: pandn %xmm0, %xmm1
7373 ; SSE-NEXT: movaps 272(%rdi), %xmm2
7374 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7375 ; SSE-NEXT: movaps 256(%rdi), %xmm0
7376 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7377 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7378 ; SSE-NEXT: movaps %xmm6, %xmm2
7379 ; SSE-NEXT: andnps %xmm0, %xmm2
7380 ; SSE-NEXT: movdqa 224(%rdi), %xmm0
7381 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7382 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7383 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7384 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
7385 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7386 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7387 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7388 ; SSE-NEXT: pand %xmm6, %xmm4
7389 ; SSE-NEXT: por %xmm2, %xmm4
7390 ; SSE-NEXT: pand %xmm14, %xmm4
7391 ; SSE-NEXT: por %xmm1, %xmm4
7392 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7393 ; SSE-NEXT: movdqa 768(%rdi), %xmm0
7394 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7395 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7396 ; SSE-NEXT: movdqa %xmm3, %xmm1
7397 ; SSE-NEXT: pandn %xmm0, %xmm1
7398 ; SSE-NEXT: movdqa 752(%rdi), %xmm2
7399 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7400 ; SSE-NEXT: movdqa 736(%rdi), %xmm0
7401 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7402 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7403 ; SSE-NEXT: pand %xmm3, %xmm0
7404 ; SSE-NEXT: por %xmm1, %xmm0
7405 ; SSE-NEXT: movdqa %xmm14, %xmm1
7406 ; SSE-NEXT: pandn %xmm0, %xmm1
7407 ; SSE-NEXT: movaps 720(%rdi), %xmm2
7408 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7409 ; SSE-NEXT: movaps 704(%rdi), %xmm0
7410 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7411 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7412 ; SSE-NEXT: movaps %xmm6, %xmm2
7413 ; SSE-NEXT: andnps %xmm0, %xmm2
7414 ; SSE-NEXT: movdqa 672(%rdi), %xmm0
7415 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7416 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7417 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7418 ; SSE-NEXT: movdqa 688(%rdi), %xmm0
7419 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7420 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7421 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7422 ; SSE-NEXT: pand %xmm6, %xmm4
7423 ; SSE-NEXT: por %xmm2, %xmm4
7424 ; SSE-NEXT: pand %xmm14, %xmm4
7425 ; SSE-NEXT: por %xmm1, %xmm4
7426 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7427 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
7428 ; SSE-NEXT: movdqa %xmm3, %xmm1
7429 ; SSE-NEXT: pandn %xmm11, %xmm1
7430 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7431 ; SSE-NEXT: psrld $16, %xmm0
7432 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7433 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7434 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
7435 ; SSE-NEXT: pand %xmm3, %xmm2
7436 ; SSE-NEXT: por %xmm1, %xmm2
7437 ; SSE-NEXT: movdqa %xmm14, %xmm1
7438 ; SSE-NEXT: pandn %xmm2, %xmm1
7439 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
7440 ; SSE-NEXT: movdqa %xmm7, %xmm2
7441 ; SSE-NEXT: pandn %xmm8, %xmm2
7442 ; SSE-NEXT: pand %xmm7, %xmm10
7443 ; SSE-NEXT: por %xmm2, %xmm10
7444 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
7445 ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
7446 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,1]
7447 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7448 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7449 ; SSE-NEXT: movdqa %xmm6, %xmm0
7450 ; SSE-NEXT: pandn %xmm2, %xmm0
7451 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,3,2,3]
7452 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7453 ; SSE-NEXT: pand %xmm6, %xmm2
7454 ; SSE-NEXT: por %xmm2, %xmm0
7455 ; SSE-NEXT: pand %xmm14, %xmm0
7456 ; SSE-NEXT: por %xmm1, %xmm0
7457 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7458 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7459 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7460 ; SSE-NEXT: movdqa %xmm3, %xmm2
7461 ; SSE-NEXT: pandn %xmm1, %xmm2
7462 ; SSE-NEXT: psrld $16, %xmm9
7463 ; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
7464 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7465 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
7466 ; SSE-NEXT: pand %xmm3, %xmm4
7467 ; SSE-NEXT: por %xmm2, %xmm4
7468 ; SSE-NEXT: movdqa %xmm14, %xmm1
7469 ; SSE-NEXT: pandn %xmm4, %xmm1
7470 ; SSE-NEXT: movdqa %xmm7, %xmm2
7471 ; SSE-NEXT: pandn %xmm15, %xmm2
7472 ; SSE-NEXT: pand %xmm7, %xmm5
7473 ; SSE-NEXT: por %xmm2, %xmm5
7474 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7475 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7476 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7477 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7478 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7479 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7480 ; SSE-NEXT: movdqa %xmm6, %xmm0
7481 ; SSE-NEXT: pandn %xmm2, %xmm0
7482 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3]
7483 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7484 ; SSE-NEXT: pand %xmm6, %xmm2
7485 ; SSE-NEXT: por %xmm2, %xmm0
7486 ; SSE-NEXT: pand %xmm14, %xmm0
7487 ; SSE-NEXT: por %xmm1, %xmm0
7488 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7489 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7490 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7491 ; SSE-NEXT: movdqa %xmm3, %xmm2
7492 ; SSE-NEXT: pandn %xmm1, %xmm2
7493 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7494 ; SSE-NEXT: psrld $16, %xmm1
7495 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7496 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7497 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7498 ; SSE-NEXT: pand %xmm3, %xmm4
7499 ; SSE-NEXT: por %xmm2, %xmm4
7500 ; SSE-NEXT: movdqa %xmm14, %xmm1
7501 ; SSE-NEXT: pandn %xmm4, %xmm1
7502 ; SSE-NEXT: movdqa %xmm7, %xmm2
7503 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7504 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7505 ; SSE-NEXT: pand %xmm7, %xmm4
7506 ; SSE-NEXT: por %xmm2, %xmm4
7507 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7508 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7509 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7510 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7511 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7512 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7513 ; SSE-NEXT: movdqa %xmm6, %xmm0
7514 ; SSE-NEXT: pandn %xmm2, %xmm0
7515 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7516 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7517 ; SSE-NEXT: pand %xmm6, %xmm2
7518 ; SSE-NEXT: por %xmm2, %xmm0
7519 ; SSE-NEXT: pand %xmm14, %xmm0
7520 ; SSE-NEXT: por %xmm1, %xmm0
7521 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7522 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7523 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7524 ; SSE-NEXT: movdqa %xmm3, %xmm2
7525 ; SSE-NEXT: pandn %xmm1, %xmm2
7526 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7527 ; SSE-NEXT: psrld $16, %xmm1
7528 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7529 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7530 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7531 ; SSE-NEXT: pand %xmm3, %xmm4
7532 ; SSE-NEXT: por %xmm2, %xmm4
7533 ; SSE-NEXT: movdqa %xmm14, %xmm1
7534 ; SSE-NEXT: pandn %xmm4, %xmm1
7535 ; SSE-NEXT: movdqa %xmm7, %xmm2
7536 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7537 ; SSE-NEXT: pand %xmm7, %xmm12
7538 ; SSE-NEXT: por %xmm2, %xmm12
7539 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7540 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7541 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7542 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7543 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7544 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7545 ; SSE-NEXT: movdqa %xmm6, %xmm0
7546 ; SSE-NEXT: pandn %xmm2, %xmm0
7547 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3]
7548 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7549 ; SSE-NEXT: pand %xmm6, %xmm2
7550 ; SSE-NEXT: por %xmm2, %xmm0
7551 ; SSE-NEXT: pand %xmm14, %xmm0
7552 ; SSE-NEXT: por %xmm1, %xmm0
7553 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7554 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7555 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7556 ; SSE-NEXT: movdqa %xmm3, %xmm2
7557 ; SSE-NEXT: pandn %xmm1, %xmm2
7558 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7559 ; SSE-NEXT: psrld $16, %xmm1
7560 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7561 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7562 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7563 ; SSE-NEXT: pand %xmm3, %xmm4
7564 ; SSE-NEXT: por %xmm2, %xmm4
7565 ; SSE-NEXT: movdqa %xmm14, %xmm1
7566 ; SSE-NEXT: pandn %xmm4, %xmm1
7567 ; SSE-NEXT: movdqa %xmm7, %xmm2
7568 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7569 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7570 ; SSE-NEXT: pand %xmm7, %xmm4
7571 ; SSE-NEXT: por %xmm2, %xmm4
7572 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7573 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7574 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7575 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7576 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7577 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7578 ; SSE-NEXT: movdqa %xmm6, %xmm0
7579 ; SSE-NEXT: pandn %xmm2, %xmm0
7580 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7581 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7582 ; SSE-NEXT: pand %xmm6, %xmm2
7583 ; SSE-NEXT: por %xmm2, %xmm0
7584 ; SSE-NEXT: pand %xmm14, %xmm0
7585 ; SSE-NEXT: por %xmm1, %xmm0
7586 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7587 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7588 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7589 ; SSE-NEXT: movdqa %xmm3, %xmm2
7590 ; SSE-NEXT: pandn %xmm1, %xmm2
7591 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7592 ; SSE-NEXT: psrld $16, %xmm1
7593 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7594 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7595 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7596 ; SSE-NEXT: pand %xmm3, %xmm4
7597 ; SSE-NEXT: por %xmm2, %xmm4
7598 ; SSE-NEXT: movdqa %xmm14, %xmm1
7599 ; SSE-NEXT: pandn %xmm4, %xmm1
7600 ; SSE-NEXT: movdqa %xmm7, %xmm2
7601 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7602 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7603 ; SSE-NEXT: pand %xmm7, %xmm4
7604 ; SSE-NEXT: por %xmm2, %xmm4
7605 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7606 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7607 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7608 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7609 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7610 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7611 ; SSE-NEXT: movdqa %xmm6, %xmm0
7612 ; SSE-NEXT: pandn %xmm2, %xmm0
7613 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7614 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7615 ; SSE-NEXT: pand %xmm6, %xmm2
7616 ; SSE-NEXT: por %xmm2, %xmm0
7617 ; SSE-NEXT: pand %xmm14, %xmm0
7618 ; SSE-NEXT: por %xmm1, %xmm0
7619 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7620 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7621 ; SSE-NEXT: movdqa %xmm10, %xmm1
7622 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7623 ; SSE-NEXT: movdqa %xmm3, %xmm2
7624 ; SSE-NEXT: pandn %xmm1, %xmm2
7625 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
7626 ; SSE-NEXT: movdqa %xmm11, %xmm1
7627 ; SSE-NEXT: psrld $16, %xmm1
7628 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7629 ; SSE-NEXT: movdqa %xmm9, %xmm4
7630 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7631 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7632 ; SSE-NEXT: pand %xmm3, %xmm4
7633 ; SSE-NEXT: por %xmm2, %xmm4
7634 ; SSE-NEXT: movdqa %xmm14, %xmm1
7635 ; SSE-NEXT: pandn %xmm4, %xmm1
7636 ; SSE-NEXT: movdqa %xmm7, %xmm2
7637 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7638 ; SSE-NEXT: pandn %xmm12, %xmm2
7639 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7640 ; SSE-NEXT: movdqa %xmm13, %xmm4
7641 ; SSE-NEXT: pand %xmm7, %xmm4
7642 ; SSE-NEXT: por %xmm2, %xmm4
7643 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7644 ; SSE-NEXT: movdqa %xmm5, %xmm2
7645 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7646 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
7647 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7648 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7649 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7650 ; SSE-NEXT: movdqa %xmm6, %xmm15
7651 ; SSE-NEXT: pandn %xmm2, %xmm15
7652 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7653 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7654 ; SSE-NEXT: pand %xmm6, %xmm2
7655 ; SSE-NEXT: por %xmm2, %xmm15
7656 ; SSE-NEXT: pand %xmm14, %xmm15
7657 ; SSE-NEXT: por %xmm1, %xmm15
7658 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7659 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7660 ; SSE-NEXT: movdqa %xmm15, %xmm1
7661 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7662 ; SSE-NEXT: movdqa %xmm3, %xmm2
7663 ; SSE-NEXT: pandn %xmm1, %xmm2
7664 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7665 ; SSE-NEXT: psrld $16, %xmm1
7666 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7667 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7668 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7669 ; SSE-NEXT: pand %xmm3, %xmm4
7670 ; SSE-NEXT: por %xmm2, %xmm4
7671 ; SSE-NEXT: movdqa %xmm14, %xmm1
7672 ; SSE-NEXT: pandn %xmm4, %xmm1
7673 ; SSE-NEXT: movdqa %xmm7, %xmm2
7674 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7675 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7676 ; SSE-NEXT: pand %xmm7, %xmm4
7677 ; SSE-NEXT: por %xmm2, %xmm4
7678 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7679 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7680 ; SSE-NEXT: pand %xmm6, %xmm2
7681 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7682 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7683 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
7684 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
7685 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
7686 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
7687 ; SSE-NEXT: pandn %xmm4, %xmm6
7688 ; SSE-NEXT: por %xmm2, %xmm6
7689 ; SSE-NEXT: pand %xmm14, %xmm6
7690 ; SSE-NEXT: por %xmm1, %xmm6
7691 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7692 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7693 ; SSE-NEXT: # xmm1 = mem[0,1,0,1]
7694 ; SSE-NEXT: movdqa %xmm3, %xmm2
7695 ; SSE-NEXT: pandn %xmm1, %xmm2
7696 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7697 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7698 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
7699 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
7700 ; SSE-NEXT: pand %xmm3, %xmm1
7701 ; SSE-NEXT: por %xmm2, %xmm1
7702 ; SSE-NEXT: movdqa %xmm14, %xmm2
7703 ; SSE-NEXT: pandn %xmm1, %xmm2
7704 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7705 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
7706 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7]
7707 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7708 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
7709 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1]
7710 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7711 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7712 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
7713 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7714 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
7715 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7716 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
7717 ; SSE-NEXT: andps %xmm14, %xmm4
7718 ; SSE-NEXT: orps %xmm2, %xmm4
7719 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7720 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7721 ; SSE-NEXT: # xmm1 = mem[0,1,0,1]
7722 ; SSE-NEXT: movdqa %xmm3, %xmm2
7723 ; SSE-NEXT: pandn %xmm1, %xmm2
7724 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7725 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7726 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
7727 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
7728 ; SSE-NEXT: pand %xmm3, %xmm1
7729 ; SSE-NEXT: por %xmm2, %xmm1
7730 ; SSE-NEXT: movdqa %xmm14, %xmm2
7731 ; SSE-NEXT: pandn %xmm1, %xmm2
7732 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7733 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
7734 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7]
7735 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7736 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
7737 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1]
7738 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7739 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7740 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
7741 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7742 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
7743 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
7744 ; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3]
7745 ; SSE-NEXT: andps %xmm14, %xmm6
7746 ; SSE-NEXT: orps %xmm2, %xmm6
7747 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7748 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,1]
7749 ; SSE-NEXT: movdqa %xmm3, %xmm4
7750 ; SSE-NEXT: pandn %xmm2, %xmm4
7751 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
7752 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,1]
7753 ; SSE-NEXT: pand %xmm3, %xmm2
7754 ; SSE-NEXT: por %xmm4, %xmm2
7755 ; SSE-NEXT: movdqa %xmm14, %xmm4
7756 ; SSE-NEXT: pandn %xmm2, %xmm4
7757 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
7758 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
7759 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3]
7760 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
7761 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
7762 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7763 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,1,2,3]
7764 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
7765 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
7766 ; SSE-NEXT: andps %xmm14, %xmm0
7767 ; SSE-NEXT: orps %xmm4, %xmm0
7768 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7769 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7770 ; SSE-NEXT: # xmm2 = mem[0,1,0,1]
7771 ; SSE-NEXT: movdqa %xmm3, %xmm4
7772 ; SSE-NEXT: pandn %xmm2, %xmm4
7773 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
7774 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7775 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
7776 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
7777 ; SSE-NEXT: pand %xmm3, %xmm2
7778 ; SSE-NEXT: por %xmm4, %xmm2
7779 ; SSE-NEXT: movdqa %xmm14, %xmm4
7780 ; SSE-NEXT: pandn %xmm2, %xmm4
7781 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7782 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
7783 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
7784 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7785 ; SSE-NEXT: # xmm2 = mem[2,2,3,3]
7786 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
7787 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7788 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7789 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7790 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
7791 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
7792 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
7793 ; SSE-NEXT: andps %xmm14, %xmm0
7794 ; SSE-NEXT: orps %xmm4, %xmm0
7795 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7796 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7797 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
7798 ; SSE-NEXT: movdqa %xmm3, %xmm6
7799 ; SSE-NEXT: pandn %xmm4, %xmm6
7800 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7801 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7802 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
7803 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
7804 ; SSE-NEXT: pand %xmm3, %xmm4
7805 ; SSE-NEXT: por %xmm6, %xmm4
7806 ; SSE-NEXT: movdqa %xmm14, %xmm6
7807 ; SSE-NEXT: pandn %xmm4, %xmm6
7808 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7809 ; SSE-NEXT: # xmm4 = mem[0,1,0,3]
7810 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7]
7811 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7812 ; SSE-NEXT: # xmm4 = mem[2,2,3,3]
7813 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
7814 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7815 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
7816 ; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
7817 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3]
7818 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
7819 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
7820 ; SSE-NEXT: andps %xmm14, %xmm0
7821 ; SSE-NEXT: orps %xmm6, %xmm0
7822 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7823 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7824 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
7825 ; SSE-NEXT: movdqa %xmm3, %xmm6
7826 ; SSE-NEXT: pandn %xmm4, %xmm6
7827 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7828 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7829 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
7830 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
7831 ; SSE-NEXT: pand %xmm3, %xmm4
7832 ; SSE-NEXT: por %xmm6, %xmm4
7833 ; SSE-NEXT: movdqa %xmm14, %xmm6
7834 ; SSE-NEXT: pandn %xmm4, %xmm6
7835 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7836 ; SSE-NEXT: # xmm4 = mem[0,1,0,3]
7837 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7]
7838 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7839 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
7840 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
7841 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7842 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7843 ; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
7844 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,1,2,3]
7845 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
7846 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3]
7847 ; SSE-NEXT: andps %xmm14, %xmm0
7848 ; SSE-NEXT: orps %xmm6, %xmm0
7849 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7850 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1]
7851 ; SSE-NEXT: movdqa %xmm3, %xmm10
7852 ; SSE-NEXT: pandn %xmm6, %xmm10
7853 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7854 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7855 ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
7856 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
7857 ; SSE-NEXT: pand %xmm3, %xmm6
7858 ; SSE-NEXT: por %xmm10, %xmm6
7859 ; SSE-NEXT: movdqa %xmm14, %xmm11
7860 ; SSE-NEXT: pandn %xmm6, %xmm11
7861 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7862 ; SSE-NEXT: # xmm6 = mem[0,1,0,3]
7863 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,4,7]
7864 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
7865 ; SSE-NEXT: # xmm6 = mem[2,2,3,3]
7866 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1]
7867 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7868 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
7869 ; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3]
7870 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,1,2,3]
7871 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
7872 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
7873 ; SSE-NEXT: andps %xmm14, %xmm0
7874 ; SSE-NEXT: orps %xmm11, %xmm0
7875 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7876 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7877 ; SSE-NEXT: movdqa %xmm0, %xmm6
7878 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7879 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
7880 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
7881 ; SSE-NEXT: pand %xmm3, %xmm6
7882 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7883 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,0,1]
7884 ; SSE-NEXT: pandn %xmm11, %xmm3
7885 ; SSE-NEXT: por %xmm6, %xmm3
7886 ; SSE-NEXT: movdqa %xmm14, %xmm6
7887 ; SSE-NEXT: pandn %xmm3, %xmm6
7888 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7889 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,0,3]
7890 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,7]
7891 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7892 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3]
7893 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1]
7894 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7895 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7896 ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
7897 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3]
7898 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
7899 ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3]
7900 ; SSE-NEXT: andps %xmm14, %xmm5
7901 ; SSE-NEXT: orps %xmm6, %xmm5
7902 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7903 ; SSE-NEXT: movdqa %xmm7, %xmm6
7904 ; SSE-NEXT: pandn %xmm13, %xmm6
7905 ; SSE-NEXT: movdqa %xmm15, %xmm11
7906 ; SSE-NEXT: pand %xmm7, %xmm11
7907 ; SSE-NEXT: por %xmm6, %xmm11
7908 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,0,4,5,6,7]
7909 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
7910 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
7911 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
7912 ; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3]
7913 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
7914 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
7915 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7]
7916 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
7917 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
7918 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
7919 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
7920 ; SSE-NEXT: movdqa %xmm14, %xmm0
7921 ; SSE-NEXT: pandn %xmm3, %xmm0
7922 ; SSE-NEXT: andps %xmm14, %xmm6
7923 ; SSE-NEXT: por %xmm6, %xmm0
7924 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7925 ; SSE-NEXT: movdqa %xmm7, %xmm3
7926 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7927 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7928 ; SSE-NEXT: movdqa %xmm9, %xmm6
7929 ; SSE-NEXT: pand %xmm7, %xmm6
7930 ; SSE-NEXT: por %xmm3, %xmm6
7931 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,0,4,5,6,7]
7932 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
7933 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
7934 ; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7]
7935 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
7936 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3]
7937 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7938 ; SSE-NEXT: movdqa %xmm0, %xmm5
7939 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7940 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
7941 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
7942 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
7943 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
7944 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
7945 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
7946 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
7947 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
7948 ; SSE-NEXT: movdqa %xmm14, %xmm1
7949 ; SSE-NEXT: pandn %xmm5, %xmm1
7950 ; SSE-NEXT: andps %xmm14, %xmm3
7951 ; SSE-NEXT: por %xmm3, %xmm1
7952 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7953 ; SSE-NEXT: movdqa %xmm7, %xmm3
7954 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7955 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7956 ; SSE-NEXT: pand %xmm7, %xmm5
7957 ; SSE-NEXT: por %xmm3, %xmm5
7958 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7]
7959 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
7960 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
7961 ; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7]
7962 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
7963 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3]
7964 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7965 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
7966 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
7967 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
7968 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
7969 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
7970 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
7971 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
7972 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
7973 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
7974 ; SSE-NEXT: movdqa %xmm14, %xmm1
7975 ; SSE-NEXT: pandn %xmm5, %xmm1
7976 ; SSE-NEXT: andps %xmm14, %xmm3
7977 ; SSE-NEXT: por %xmm3, %xmm1
7978 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7979 ; SSE-NEXT: movdqa %xmm7, %xmm3
7980 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7981 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7982 ; SSE-NEXT: pand %xmm7, %xmm5
7983 ; SSE-NEXT: por %xmm3, %xmm5
7984 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7]
7985 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
7986 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7987 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
7988 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
7989 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
7990 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
7991 ; SSE-NEXT: movdqa %xmm11, %xmm1
7992 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7993 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
7994 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
7995 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
7996 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7997 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
7998 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
7999 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8000 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
8001 ; SSE-NEXT: movdqa %xmm14, %xmm13
8002 ; SSE-NEXT: pandn %xmm1, %xmm13
8003 ; SSE-NEXT: andps %xmm14, %xmm3
8004 ; SSE-NEXT: por %xmm3, %xmm13
8005 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8006 ; SSE-NEXT: movdqa %xmm7, %xmm1
8007 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8008 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8009 ; SSE-NEXT: pand %xmm7, %xmm3
8010 ; SSE-NEXT: por %xmm1, %xmm3
8011 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7]
8012 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8013 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,7]
8014 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
8015 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
8016 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8017 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8018 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8019 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
8020 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7]
8021 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8022 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8023 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
8024 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
8025 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
8026 ; SSE-NEXT: movdqa %xmm14, %xmm13
8027 ; SSE-NEXT: pandn %xmm3, %xmm13
8028 ; SSE-NEXT: andps %xmm14, %xmm1
8029 ; SSE-NEXT: por %xmm1, %xmm13
8030 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8031 ; SSE-NEXT: movdqa %xmm7, %xmm1
8032 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8033 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8034 ; SSE-NEXT: pand %xmm7, %xmm3
8035 ; SSE-NEXT: por %xmm1, %xmm3
8036 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7]
8037 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8038 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
8039 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
8040 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
8041 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
8042 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8043 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8044 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8045 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
8046 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8047 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8048 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
8049 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
8050 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
8051 ; SSE-NEXT: movdqa %xmm14, %xmm3
8052 ; SSE-NEXT: pandn %xmm2, %xmm3
8053 ; SSE-NEXT: andps %xmm14, %xmm1
8054 ; SSE-NEXT: por %xmm1, %xmm3
8055 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8056 ; SSE-NEXT: movdqa %xmm7, %xmm1
8057 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8058 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8059 ; SSE-NEXT: pand %xmm7, %xmm2
8060 ; SSE-NEXT: por %xmm1, %xmm2
8061 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7]
8062 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8063 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7]
8064 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
8065 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
8066 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8067 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8068 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8069 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8070 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
8071 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8072 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8073 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
8074 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
8075 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
8076 ; SSE-NEXT: movdqa %xmm14, %xmm10
8077 ; SSE-NEXT: pandn %xmm2, %xmm10
8078 ; SSE-NEXT: andps %xmm14, %xmm1
8079 ; SSE-NEXT: por %xmm1, %xmm10
8080 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8081 ; SSE-NEXT: movdqa %xmm7, %xmm1
8082 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8083 ; SSE-NEXT: pand %xmm7, %xmm12
8084 ; SSE-NEXT: por %xmm1, %xmm12
8085 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,0,4,5,6,7]
8086 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8087 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
8088 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
8089 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
8090 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8091 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8092 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8093 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8094 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
8095 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8096 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8097 ; SSE-NEXT: andps %xmm14, %xmm1
8098 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
8099 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
8100 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
8101 ; SSE-NEXT: pandn %xmm2, %xmm14
8102 ; SSE-NEXT: por %xmm1, %xmm14
8103 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8104 ; SSE-NEXT: movdqa %xmm2, %xmm1
8105 ; SSE-NEXT: psrld $16, %xmm1
8106 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8107 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
8108 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8109 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
8110 ; SSE-NEXT: movdqa %xmm2, %xmm4
8111 ; SSE-NEXT: movdqa %xmm10, %xmm2
8112 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
8113 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8114 ; SSE-NEXT: psrlq $48, %xmm2
8115 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8116 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8117 ; SSE-NEXT: psrlq $16, %xmm2
8118 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8119 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8120 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8121 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8122 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8123 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8124 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8125 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8126 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8127 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8128 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8129 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8130 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8131 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8132 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
8133 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
8134 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8135 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8136 ; SSE-NEXT: movdqa %xmm13, %xmm1
8137 ; SSE-NEXT: psrld $16, %xmm1
8138 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8139 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8140 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8141 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
8142 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
8143 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8144 ; SSE-NEXT: psrlq $48, %xmm2
8145 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8146 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8147 ; SSE-NEXT: psrlq $16, %xmm2
8148 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
8149 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8150 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8151 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
8152 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8153 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
8154 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8155 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8156 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8157 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8158 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8159 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8160 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8161 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8162 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8163 ; SSE-NEXT: movdqa %xmm0, %xmm1
8164 ; SSE-NEXT: psrld $16, %xmm1
8165 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8166 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8167 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8168 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
8169 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8170 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8171 ; SSE-NEXT: psrlq $48, %xmm2
8172 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8173 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8174 ; SSE-NEXT: psrlq $16, %xmm2
8175 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8176 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8177 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8178 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8179 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8180 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8181 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8182 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8183 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8184 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8185 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8186 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8187 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8188 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8189 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8190 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8191 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8192 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8193 ; SSE-NEXT: movdqa %xmm0, %xmm1
8194 ; SSE-NEXT: psrld $16, %xmm1
8195 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8196 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8197 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8198 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
8199 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8200 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8201 ; SSE-NEXT: psrlq $48, %xmm2
8202 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8203 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8204 ; SSE-NEXT: psrlq $16, %xmm2
8205 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8206 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8207 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8208 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8209 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3]
8210 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8211 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3]
8212 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
8213 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8214 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8215 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8216 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8217 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8218 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8219 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8220 ; SSE-NEXT: movdqa %xmm2, %xmm1
8221 ; SSE-NEXT: psrld $16, %xmm1
8222 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8223 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8224 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8225 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
8226 ; SSE-NEXT: movdqa %xmm2, %xmm4
8227 ; SSE-NEXT: movdqa %xmm0, %xmm2
8228 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
8229 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8230 ; SSE-NEXT: psrlq $48, %xmm2
8231 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8232 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8233 ; SSE-NEXT: psrlq $16, %xmm2
8234 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8235 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8236 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8237 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8238 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8239 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8240 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8241 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
8242 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8243 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8244 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8245 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8246 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8247 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8248 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8249 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8250 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8251 ; SSE-NEXT: movdqa %xmm12, %xmm13
8252 ; SSE-NEXT: psrld $16, %xmm13
8253 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8254 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
8255 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
8256 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
8257 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8258 ; SSE-NEXT: psrlq $48, %xmm2
8259 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8260 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8261 ; SSE-NEXT: psrlq $16, %xmm2
8262 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8263 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
8264 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8265 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8266 ; SSE-NEXT: pshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload
8267 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8268 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8269 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8270 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8271 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8272 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8273 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8274 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8275 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8276 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8277 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8278 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8279 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8280 ; SSE-NEXT: movdqa %xmm0, %xmm12
8281 ; SSE-NEXT: psrld $16, %xmm12
8282 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8283 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
8284 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
8285 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8286 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8287 ; SSE-NEXT: psrlq $48, %xmm2
8288 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8289 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8290 ; SSE-NEXT: movdqa %xmm9, %xmm2
8291 ; SSE-NEXT: psrlq $16, %xmm2
8292 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8293 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
8294 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8295 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8296 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8297 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8298 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8299 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8300 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8301 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8302 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8303 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8304 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8305 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8306 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8307 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8308 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8309 ; SSE-NEXT: movdqa %xmm2, %xmm11
8310 ; SSE-NEXT: psrld $16, %xmm11
8311 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8312 ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
8313 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
8314 ; SSE-NEXT: movdqa %xmm2, %xmm3
8315 ; SSE-NEXT: movdqa %xmm0, %xmm2
8316 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8317 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8318 ; SSE-NEXT: psrlq $48, %xmm2
8319 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8320 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8321 ; SSE-NEXT: movdqa %xmm4, %xmm2
8322 ; SSE-NEXT: psrlq $16, %xmm2
8323 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8324 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
8325 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8326 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8327 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8328 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
8329 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8330 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8331 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8332 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8333 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8334 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8335 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8336 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8337 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8338 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8339 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8340 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8341 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8342 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8343 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8344 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8345 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8346 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8347 ; SSE-NEXT: movdqa %xmm7, %xmm1
8348 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8349 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8350 ; SSE-NEXT: pand %xmm7, %xmm2
8351 ; SSE-NEXT: por %xmm1, %xmm2
8352 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8353 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8354 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8355 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8356 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8357 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8358 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8359 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8360 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8361 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8362 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8363 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8364 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8365 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8366 ; SSE-NEXT: movdqa %xmm7, %xmm1
8367 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8368 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8369 ; SSE-NEXT: pand %xmm7, %xmm2
8370 ; SSE-NEXT: por %xmm1, %xmm2
8371 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8372 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8373 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8374 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8375 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8376 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8377 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8378 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8379 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8380 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8381 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8382 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8383 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8384 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8385 ; SSE-NEXT: movdqa %xmm7, %xmm1
8386 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8387 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8388 ; SSE-NEXT: pand %xmm7, %xmm2
8389 ; SSE-NEXT: por %xmm1, %xmm2
8390 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8391 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8392 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8393 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8394 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8395 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8396 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8397 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8398 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8399 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8400 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8401 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8402 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8403 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8404 ; SSE-NEXT: movdqa %xmm7, %xmm1
8405 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8406 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8407 ; SSE-NEXT: pand %xmm7, %xmm2
8408 ; SSE-NEXT: por %xmm1, %xmm2
8409 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8410 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8411 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8412 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8413 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8414 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8415 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8416 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8417 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
8418 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8419 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8420 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
8421 ; SSE-NEXT: movdqa %xmm7, %xmm1
8422 ; SSE-NEXT: pandn (%rsp), %xmm1 # 16-byte Folded Reload
8423 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8424 ; SSE-NEXT: pand %xmm7, %xmm2
8425 ; SSE-NEXT: por %xmm1, %xmm2
8426 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8427 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8428 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8429 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8430 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8431 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2]
8432 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8433 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7]
8434 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8435 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
8436 ; SSE-NEXT: movdqa %xmm7, %xmm1
8437 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8438 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8439 ; SSE-NEXT: pand %xmm7, %xmm2
8440 ; SSE-NEXT: por %xmm1, %xmm2
8441 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8442 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8443 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8444 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8445 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8446 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2]
8447 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3]
8448 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7]
8449 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8450 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
8451 ; SSE-NEXT: movdqa %xmm7, %xmm1
8452 ; SSE-NEXT: pandn %xmm5, %xmm1
8453 ; SSE-NEXT: pand %xmm7, %xmm3
8454 ; SSE-NEXT: por %xmm1, %xmm3
8455 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3]
8456 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8457 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8458 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8459 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
8460 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8461 ; SSE-NEXT: movdqa %xmm3, %xmm1
8462 ; SSE-NEXT: pand %xmm7, %xmm1
8463 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8464 ; SSE-NEXT: pandn %xmm5, %xmm7
8465 ; SSE-NEXT: por %xmm1, %xmm7
8466 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8467 ; SSE-NEXT: movdqa %xmm4, %xmm1
8468 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8469 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
8470 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8471 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8472 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8473 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
8474 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3]
8475 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
8476 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8477 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8478 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
8479 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2]
8480 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8481 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
8482 ; SSE-NEXT: movdqa %xmm4, %xmm1
8483 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8484 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8485 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8486 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8487 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8488 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8489 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8490 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7]
8491 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
8492 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3]
8493 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2]
8494 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8495 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
8496 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
8497 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8498 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8499 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8500 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8501 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8502 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8503 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8504 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8505 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8506 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8507 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8508 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8509 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8510 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8511 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8512 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8513 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
8514 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8515 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
8516 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
8517 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8518 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8519 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8520 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8521 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8522 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8523 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8524 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8525 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8526 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8527 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8528 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8529 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8530 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8531 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8532 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8533 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2]
8534 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8535 ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3]
8536 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
8537 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8538 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8539 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8540 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8541 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8542 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8543 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8544 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8545 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8546 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8547 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8548 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8549 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8550 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8551 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8552 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8553 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2]
8554 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8555 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8556 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
8557 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8558 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8559 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8560 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8561 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8562 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8563 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8564 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8565 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8566 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8567 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8568 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8569 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8570 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8571 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8572 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8573 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,2,2]
8574 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8575 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
8576 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
8577 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8578 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8579 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8580 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8581 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8582 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8583 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8584 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8585 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8586 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8587 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8588 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8589 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
8590 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8591 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8592 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8593 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2]
8594 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8595 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
8596 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
8597 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8598 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8599 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8600 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8601 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8602 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
8603 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8604 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8605 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
8606 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8607 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8608 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
8609 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8610 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
8611 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8612 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8613 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
8614 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8615 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8616 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
8617 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
8618 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8619 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8620 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8621 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8622 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8623 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
8624 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
8625 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8626 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8627 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8628 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8629 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8630 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8631 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
8632 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
8633 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7]
8634 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
8635 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
8636 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8637 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
8638 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8639 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
8640 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8641 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
8642 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8643 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
8644 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8645 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
8646 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8647 ; SSE-NEXT: movaps %xmm1, (%rsi)
8648 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8649 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
8650 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8651 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
8652 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8653 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
8654 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8655 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
8656 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8657 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
8658 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8659 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
8660 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8661 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
8662 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8663 ; SSE-NEXT: movaps %xmm1, (%rdx)
8664 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8665 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
8666 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8667 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
8668 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8669 ; SSE-NEXT: movaps %xmm1, 96(%rcx)
8670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8671 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
8672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8673 ; SSE-NEXT: movaps %xmm1, 64(%rcx)
8674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8675 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
8676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8677 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
8678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8679 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
8680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8681 ; SSE-NEXT: movaps %xmm1, (%rcx)
8682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8683 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
8684 ; SSE-NEXT: movdqa %xmm14, 112(%r8)
8685 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8686 ; SSE-NEXT: movaps %xmm1, 96(%r8)
8687 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8688 ; SSE-NEXT: movaps %xmm1, 80(%r8)
8689 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8690 ; SSE-NEXT: movaps %xmm1, 64(%r8)
8691 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8692 ; SSE-NEXT: movaps %xmm1, 48(%r8)
8693 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8694 ; SSE-NEXT: movaps %xmm1, 32(%r8)
8695 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8696 ; SSE-NEXT: movaps %xmm1, 16(%r8)
8697 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8698 ; SSE-NEXT: movaps %xmm1, (%r8)
8699 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8700 ; SSE-NEXT: movaps %xmm1, 112(%r9)
8701 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8702 ; SSE-NEXT: movaps %xmm1, 96(%r9)
8703 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8704 ; SSE-NEXT: movaps %xmm1, 80(%r9)
8705 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8706 ; SSE-NEXT: movaps %xmm1, 64(%r9)
8707 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8708 ; SSE-NEXT: movaps %xmm1, 48(%r9)
8709 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8710 ; SSE-NEXT: movaps %xmm1, 32(%r9)
8711 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8712 ; SSE-NEXT: movaps %xmm1, 16(%r9)
8713 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8714 ; SSE-NEXT: movaps %xmm1, (%r9)
8715 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8716 ; SSE-NEXT: movaps %xmm11, 112(%rax)
8717 ; SSE-NEXT: movaps %xmm12, 96(%rax)
8718 ; SSE-NEXT: movaps %xmm13, 80(%rax)
8719 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8720 ; SSE-NEXT: movaps %xmm1, 64(%rax)
8721 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8722 ; SSE-NEXT: movaps %xmm1, 48(%rax)
8723 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8724 ; SSE-NEXT: movaps %xmm1, 32(%rax)
8725 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8726 ; SSE-NEXT: movaps %xmm1, 16(%rax)
8727 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8728 ; SSE-NEXT: movaps %xmm1, (%rax)
8729 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8730 ; SSE-NEXT: movapd %xmm0, 112(%rax)
8731 ; SSE-NEXT: movapd %xmm4, 96(%rax)
8732 ; SSE-NEXT: movapd %xmm5, 80(%rax)
8733 ; SSE-NEXT: movapd %xmm6, 64(%rax)
8734 ; SSE-NEXT: movapd %xmm7, 48(%rax)
8735 ; SSE-NEXT: movapd %xmm8, 32(%rax)
8736 ; SSE-NEXT: movapd %xmm9, 16(%rax)
8737 ; SSE-NEXT: movapd %xmm10, (%rax)
8738 ; SSE-NEXT: addq $1352, %rsp # imm = 0x548
8739 ; SSE-NEXT: retq
8740 ;
8741 ; AVX1-ONLY-LABEL: load_i16_stride7_vf64:
8742 ; AVX1-ONLY: # %bb.0:
8743 ; AVX1-ONLY-NEXT: subq $1544, %rsp # imm = 0x608
8744 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0
8745 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8746 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0
8747 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1
8748 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8749 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
8750 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8751 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
8752 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8753 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1
8754 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2
8755 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8756 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8757 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
8758 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2
8759 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8760 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1
8761 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8762 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8763 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
8764 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
8765 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
8766 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7
8767 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0
8768 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8769 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
8770 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8771 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
8772 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8773 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
8774 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
8775 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
8776 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8777 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
8778 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
8779 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8780 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3
8781 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8782 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0
8783 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8784 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8785 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
8786 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
8787 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
8788 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
8789 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8790 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6
8791 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm6[2],zero
8792 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8793 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
8794 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8795 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2
8796 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3
8797 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
8798 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8799 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
8800 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8801 ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm1
8802 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8803 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
8804 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2
8805 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
8806 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
8807 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8808 ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2
8809 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8810 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2
8811 ; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm3
8812 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8813 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8814 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
8815 ; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2
8816 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8817 ; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3
8818 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8819 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8820 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8821 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8822 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
8823 ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm2
8824 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8825 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
8826 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8827 ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm3
8828 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8829 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
8830 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8831 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm5
8832 ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm15
8833 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm15[2],zero
8834 ; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8835 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8836 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
8837 ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4
8838 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8839 ; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3
8840 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8841 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
8842 ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4
8843 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8844 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
8845 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
8846 ; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm4
8847 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8848 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
8849 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
8850 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
8851 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
8852 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
8853 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
8854 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8855 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
8856 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8857 ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1
8858 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8859 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
8860 ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2
8861 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8862 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
8863 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8864 ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2
8865 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8866 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2
8867 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3
8868 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8869 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8870 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
8871 ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3
8872 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8873 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2
8874 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8875 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8876 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8877 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8878 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
8879 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10
8880 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3]
8881 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8882 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8883 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12
8884 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3]
8885 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8886 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8887 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9
8888 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3
8889 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8890 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[2],xmm3[2],zero
8891 ; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8892 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
8893 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4
8894 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8895 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3
8896 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8897 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
8898 ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4
8899 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8900 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
8901 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
8902 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm11
8903 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3]
8904 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8905 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
8906 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
8907 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
8908 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
8909 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
8910 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8911 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
8912 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8913 ; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1
8914 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8915 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
8916 ; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2
8917 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8918 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
8919 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8920 ; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2
8921 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8922 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2
8923 ; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm3
8924 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8925 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8926 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
8927 ; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm2
8928 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8929 ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm13
8930 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
8931 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8932 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8933 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8934 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
8935 ; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm2
8936 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8937 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
8938 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8939 ; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm3
8940 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8941 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
8942 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8943 ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm14
8944 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3
8945 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8946 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm14[2],xmm3[2],zero
8947 ; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8948 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
8949 ; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm4
8950 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8951 ; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3
8952 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8953 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
8954 ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4
8955 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8956 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
8957 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
8958 ; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm4
8959 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8960 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
8961 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
8962 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
8963 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
8964 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm0
8965 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
8966 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8967 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
8968 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8969 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8970 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8971 ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8972 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
8973 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8974 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8975 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8976 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6],mem[7]
8977 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
8978 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
8979 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
8980 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8981 ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1
8982 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8983 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8984 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
8985 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1
8986 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8987 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8988 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
8989 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8990 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
8991 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
8992 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8993 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2
8994 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
8995 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8996 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8997 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
8998 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
8999 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4
9000 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9001 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload
9002 ; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7]
9003 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9004 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7]
9005 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
9006 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9007 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2
9008 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
9009 ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8
9010 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
9011 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
9012 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9013 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3
9014 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2
9015 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0
9016 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9017 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
9018 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9019 ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9020 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9021 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9022 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9023 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9024 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
9025 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9026 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
9027 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9028 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9029 ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3
9030 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9031 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
9032 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9033 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
9034 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1
9035 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3
9036 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9037 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9038 ; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
9039 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9040 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
9041 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9042 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9043 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4
9044 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9045 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9046 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9047 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9048 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
9049 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
9050 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9051 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm5
9052 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9053 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3
9054 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
9055 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
9056 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9057 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9058 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9059 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0
9060 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9061 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9062 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9063 ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9064 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9065 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9066 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9067 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9068 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7]
9069 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9070 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
9071 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9072 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9073 ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3
9074 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9075 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
9076 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9077 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload
9078 ; AVX1-ONLY-NEXT: # xmm3 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
9079 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
9080 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15
9081 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7]
9082 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9083 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
9084 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9085 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9086 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4
9087 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9088 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9089 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9090 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9091 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
9092 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
9093 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm5
9094 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9095 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3
9096 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
9097 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
9098 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9099 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9100 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9101 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0
9102 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9103 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9104 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9105 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
9106 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9107 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9108 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9109 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9110 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5],xmm0[6],xmm10[7]
9111 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9112 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
9113 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9114 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9115 ; AVX1-ONLY-NEXT: vpslld $16, %xmm8, %xmm3
9116 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
9117 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9118 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9119 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
9120 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3
9121 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9122 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9123 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm15[1],xmm11[2,3,4,5,6,7]
9124 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9125 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
9126 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9127 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9128 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4
9129 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9130 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9131 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9132 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9133 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
9134 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
9135 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9136 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5
9137 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9138 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9139 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
9140 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4
9141 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
9142 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9143 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9144 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9145 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
9146 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9147 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9148 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm2, %xmm2
9149 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9150 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
9151 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9152 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3]
9153 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
9154 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9155 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3],mem[4,5,6,7]
9156 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9157 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9158 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9159 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9160 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
9161 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9162 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9163 ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2]
9164 ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9165 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7]
9166 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9167 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1]
9168 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm4[7]
9169 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9170 ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
9171 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4
9172 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9173 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3]
9174 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9175 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9176 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3]
9177 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1]
9178 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
9179 ; AVX1-ONLY-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3]
9180 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
9181 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
9182 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
9183 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9184 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
9185 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5
9186 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
9187 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9188 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3
9189 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4
9190 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3
9191 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9192 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9193 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm3
9194 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
9195 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
9196 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9197 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3]
9198 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
9199 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9200 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7]
9201 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
9202 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9203 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9204 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9205 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
9206 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
9207 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9208 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
9209 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
9210 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9211 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3]
9212 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm4[1],xmm5[1]
9213 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9214 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9215 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
9216 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,2,3]
9217 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
9218 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
9219 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9220 ; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2]
9221 ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
9222 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7]
9223 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9224 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1]
9225 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
9226 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9227 ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1]
9228 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
9229 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9230 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5
9231 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6
9232 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5
9233 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9234 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9235 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3
9236 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5
9237 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3
9238 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9239 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm1, %xmm3
9240 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
9241 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3]
9242 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9243 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3],xmm10[4,5,6,7]
9244 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
9245 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9246 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9247 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
9248 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
9249 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9250 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3]
9251 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
9252 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3]
9253 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
9254 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
9255 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9256 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3]
9257 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
9258 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4,5,6,7]
9259 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,2,2]
9260 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm7[6,7]
9261 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9262 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1]
9263 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7]
9264 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9265 ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1]
9266 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
9267 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9268 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5
9269 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7
9270 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5
9271 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9272 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9273 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3
9274 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm9, %ymm5
9275 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm0
9276 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9277 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9278 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm3
9279 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9280 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
9281 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9282 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,3,2,3]
9283 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9284 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9285 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7]
9286 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
9287 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9288 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9289 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9290 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
9291 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],xmm5[6,7]
9292 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9293 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3]
9294 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
9295 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9296 ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3]
9297 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1]
9298 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9299 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9300 ; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9301 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3]
9302 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
9303 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3,4,5,6,7]
9304 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9305 ; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2]
9306 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9307 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm15[6,7]
9308 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9309 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1]
9310 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7]
9311 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9312 ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1]
9313 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
9314 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7
9315 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14
9316 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7
9317 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
9318 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm9, %ymm5
9319 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7
9320 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5
9321 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9322 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9323 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9324 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
9325 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
9326 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7]
9327 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9328 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
9329 ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7]
9330 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7]
9331 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,6,7]
9332 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3,4,5,6,7]
9333 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9334 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
9335 ; AVX1-ONLY-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
9336 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9337 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,1,2,3,4,5,6,7]
9338 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9339 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7]
9340 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
9341 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
9342 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9343 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9344 ; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7]
9345 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7]
9346 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
9347 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7]
9348 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9349 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9350 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9351 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
9352 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
9353 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
9354 ; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9355 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
9356 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15
9357 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9358 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12
9359 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12
9360 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2
9361 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm11, %ymm12
9362 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2
9363 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12
9364 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm9, %ymm12
9365 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2
9366 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2
9367 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9368 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9369 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9370 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
9371 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
9372 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7]
9373 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
9374 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload
9375 ; AVX1-ONLY-NEXT: # xmm12 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7]
9376 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
9377 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
9378 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7]
9379 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9380 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9381 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9382 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,3,4,5,6,7]
9383 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9384 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
9385 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
9386 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
9387 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9388 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload
9389 ; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3,4,5,6,7]
9390 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
9391 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
9392 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7]
9393 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9394 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9395 ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
9396 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
9397 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
9398 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9399 ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
9400 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14
9401 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9402 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm15
9403 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
9404 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9405 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4
9406 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14
9407 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4
9408 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
9409 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9410 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12
9411 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4
9412 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4
9413 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9414 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
9415 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
9416 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
9417 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload
9418 ; AVX1-ONLY-NEXT: # xmm12 = xmm13[0,1,2,3,4,5],mem[6],xmm13[7]
9419 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
9420 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
9421 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7]
9422 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9423 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
9424 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7]
9425 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9426 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
9427 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
9428 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9429 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9430 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
9431 ; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
9432 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
9433 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
9434 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7]
9435 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
9436 ; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9437 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
9438 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
9439 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
9440 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14
9441 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9442 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15
9443 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
9444 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9445 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
9446 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14
9447 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm14, %ymm3
9448 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
9449 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9450 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12
9451 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3
9452 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm0
9453 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9454 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9455 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9456 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
9457 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
9458 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7]
9459 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9460 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
9461 ; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
9462 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
9463 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
9464 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm3[1,2],xmm12[3,4,5,6,7]
9465 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9466 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9467 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9468 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,2,3,4,5,6,7]
9469 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9470 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
9471 ; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9472 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7]
9473 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
9474 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9475 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
9476 ; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
9477 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
9478 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
9479 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3,4,5,6,7]
9480 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9481 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
9482 ; AVX1-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9483 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
9484 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
9485 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9486 ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
9487 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5
9488 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9489 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14
9490 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5
9491 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6
9492 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm1
9493 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm6, %ymm1
9494 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5
9495 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5
9496 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1
9497 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm0
9498 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9499 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
9500 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7]
9501 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9502 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3]
9503 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9504 ; AVX1-ONLY-NEXT: vpunpckldq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
9505 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
9506 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3,4,5,6,7]
9507 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9508 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9509 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
9510 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9511 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1
9512 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9513 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3]
9514 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
9515 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm5
9516 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9517 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
9518 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
9519 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9520 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,3]
9521 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
9522 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9523 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9524 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
9525 ; AVX1-ONLY-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9526 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7]
9527 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6
9528 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9529 ; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3]
9530 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6
9531 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
9532 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9533 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2
9534 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5
9535 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0
9536 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9537 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9538 ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1]
9539 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9540 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7]
9541 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9542 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,3,2,3]
9543 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9544 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9545 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
9546 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7]
9547 ; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9548 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,1]
9549 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
9550 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7]
9551 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9552 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5
9553 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9554 ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3]
9555 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
9556 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9557 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6
9558 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9559 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
9560 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
9561 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9562 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,1,0,3]
9563 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
9564 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9565 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
9566 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9567 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
9568 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6
9569 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9570 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3]
9571 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
9572 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
9573 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9574 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9575 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9576 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5
9577 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2
9578 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9579 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9580 ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1]
9581 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9582 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
9583 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9584 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3]
9585 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9586 ; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9587 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
9588 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7]
9589 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
9590 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
9591 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
9592 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9593 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4
9594 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9595 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3]
9596 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9597 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9598 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm5
9599 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9600 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
9601 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
9602 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9603 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3]
9604 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
9605 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9606 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
9607 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9608 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9609 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
9610 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9611 ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3]
9612 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
9613 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
9614 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9615 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9616 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4
9617 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
9618 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9619 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9620 ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1]
9621 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9622 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
9623 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9624 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
9625 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
9626 ; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9627 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
9628 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7]
9629 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
9630 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
9631 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9632 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9633 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3
9634 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9635 ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3]
9636 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9637 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9638 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm4
9639 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9640 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
9641 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
9642 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9643 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
9644 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
9645 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9646 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9647 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9648 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
9649 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1
9650 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9651 ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3]
9652 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
9653 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
9654 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9655 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9656 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1
9657 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
9658 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9659 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1
9660 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9661 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9662 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9663 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
9664 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
9665 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
9666 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
9667 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9668 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9669 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7]
9670 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
9671 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
9672 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9673 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9674 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2
9675 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9676 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9677 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9678 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
9679 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
9680 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9681 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
9682 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
9683 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
9684 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7]
9685 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
9686 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
9687 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
9688 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9689 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9690 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
9691 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9692 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1
9693 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2
9694 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
9695 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9696 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9697 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1
9698 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9699 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9700 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9701 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
9702 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9703 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9704 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
9705 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
9706 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9707 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9708 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7]
9709 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
9710 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
9711 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9712 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9713 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2
9714 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9715 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9716 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9717 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9718 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
9719 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
9720 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9721 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
9722 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9723 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9724 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7]
9725 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
9726 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7]
9727 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9728 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9729 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
9730 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
9731 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9732 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9733 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9734 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
9735 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9736 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1
9737 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2
9738 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
9739 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9740 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9741 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2
9742 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9743 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9744 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
9745 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9746 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9747 ; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9748 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3
9749 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
9750 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9751 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9752 ; AVX1-ONLY-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7]
9753 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
9754 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
9755 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
9756 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9757 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm3
9758 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9759 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9760 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9761 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9762 ; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9763 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
9764 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
9765 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
9766 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9767 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9768 ; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm1[1],mem[2,3,4,5,6,7]
9769 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
9770 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7]
9771 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9772 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
9773 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
9774 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
9775 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9776 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9777 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9778 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
9779 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9780 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9781 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9782 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm1
9783 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9784 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9785 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
9786 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3
9787 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9788 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4
9789 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9790 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9791 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9792 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9793 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9794 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9795 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7]
9796 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
9797 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6]
9798 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
9799 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9800 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4
9801 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9802 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
9803 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9804 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9805 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9806 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
9807 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
9808 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
9809 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9810 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9811 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm13[1],xmm10[2,3,4,5,6,7]
9812 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
9813 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,7,7]
9814 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9815 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9816 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
9817 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,2]
9818 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9819 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9820 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
9821 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
9822 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9823 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3
9824 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4
9825 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3
9826 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9827 ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload
9828 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9829 ; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero
9830 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9831 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
9832 ; AVX1-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
9833 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
9834 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9835 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
9836 ; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7]
9837 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
9838 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
9839 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
9840 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9841 ; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1]
9842 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9843 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9844 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
9845 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9846 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
9847 ; AVX1-ONLY-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9848 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
9849 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
9850 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
9851 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9852 ; AVX1-ONLY-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
9853 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9854 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9855 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9856 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3]
9857 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
9858 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
9859 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9860 ; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3]
9861 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
9862 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
9863 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
9864 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9865 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4
9866 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5
9867 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
9868 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload
9869 ; AVX1-ONLY-NEXT: # xmm5 = zero,xmm15[1],mem[0],zero
9870 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
9871 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
9872 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7]
9873 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
9874 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
9875 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7]
9876 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
9877 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9878 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9879 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9880 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
9881 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
9882 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
9883 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
9884 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
9885 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
9886 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9887 ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3]
9888 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
9889 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
9890 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3]
9891 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
9892 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7]
9893 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
9894 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5
9895 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm6, %ymm6
9896 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5
9897 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9898 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
9899 ; AVX1-ONLY-NEXT: # xmm6 = zero,xmm0[1],mem[0],zero
9900 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9901 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
9902 ; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3]
9903 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
9904 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9905 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
9906 ; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
9907 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
9908 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
9909 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
9910 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9911 ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1]
9912 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9913 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9914 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
9915 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9916 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
9917 ; AVX1-ONLY-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9918 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
9919 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
9920 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
9921 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9922 ; AVX1-ONLY-NEXT: # xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
9923 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9924 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
9925 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9926 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3]
9927 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
9928 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7]
9929 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9930 ; AVX1-ONLY-NEXT: # xmm9 = mem[3,3,3,3]
9931 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
9932 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7]
9933 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
9934 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6
9935 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7
9936 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6
9937 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9938 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
9939 ; AVX1-ONLY-NEXT: # xmm7 = zero,xmm0[1],mem[0],zero
9940 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9941 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
9942 ; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
9943 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
9944 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9945 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
9946 ; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
9947 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
9948 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
9949 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
9950 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9951 ; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1]
9952 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9953 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9954 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9955 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9956 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
9957 ; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9958 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
9959 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
9960 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
9961 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9962 ; AVX1-ONLY-NEXT: # xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
9963 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9964 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
9965 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9966 ; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,3]
9967 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
9968 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7]
9969 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9970 ; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3]
9971 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
9972 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7]
9973 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
9974 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7
9975 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm0
9976 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0
9977 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9978 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi)
9979 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9980 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi)
9981 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9982 ; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi)
9983 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9984 ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi)
9985 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9986 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx)
9987 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9988 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx)
9989 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9990 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
9991 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9992 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx)
9993 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9994 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx)
9995 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9996 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx)
9997 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9998 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx)
9999 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10000 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx)
10001 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10002 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8)
10003 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10004 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
10005 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10006 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8)
10007 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10008 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8)
10009 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10010 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9)
10011 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10012 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9)
10013 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10014 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9)
10015 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10016 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9)
10017 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
10018 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10019 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax)
10020 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10021 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax)
10022 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10023 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax)
10024 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10025 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax)
10026 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
10027 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax)
10028 ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax)
10029 ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax)
10030 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax)
10031 ; AVX1-ONLY-NEXT: addq $1544, %rsp # imm = 0x608
10032 ; AVX1-ONLY-NEXT: vzeroupper
10033 ; AVX1-ONLY-NEXT: retq
10034 ;
10035 ; AVX2-SLOW-LABEL: load_i16_stride7_vf64:
10036 ; AVX2-SLOW: # %bb.0:
10037 ; AVX2-SLOW-NEXT: subq $1448, %rsp # imm = 0x5A8
10038 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13
10039 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm14
10040 ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15
10041 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12
10042 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm9
10043 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10
10044 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
10045 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10046 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
10047 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10048 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0
10049 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10050 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
10051 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10052 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
10053 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
10054 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
10055 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
10056 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
10057 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10058 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
10059 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
10060 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4
10061 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
10062 ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
10063 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10064 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7]
10065 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10066 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10067 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10068 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
10069 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10070 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7]
10071 ; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10072 ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10073 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
10074 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10075 ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0
10076 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8
10077 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
10078 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
10079 ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10080 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10081 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10082 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
10083 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10084 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7
10085 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1
10086 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
10087 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10088 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10089 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
10090 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10091 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0
10092 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10093 ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0
10094 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10095 ; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm8
10096 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7]
10097 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10098 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
10099 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10100 ; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm11
10101 ; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm5
10102 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7]
10103 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm0
10104 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
10105 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10106 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
10107 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
10108 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10109 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7]
10110 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
10111 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15]
10112 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
10113 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10114 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
10115 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
10116 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm5
10117 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
10118 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
10119 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
10120 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9
10121 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10122 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7]
10123 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
10124 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
10125 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
10126 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10127 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
10128 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
10129 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
10130 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4
10131 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10132 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15
10133 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
10134 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm14
10135 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10136 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
10137 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
10138 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10139 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7]
10140 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13
10141 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10142 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10143 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
10144 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
10145 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
10146 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7
10147 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0
10148 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10149 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10150 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10151 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7]
10152 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
10153 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
10154 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
10155 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10156 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10157 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
10158 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10159 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
10160 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2
10161 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0
10162 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10163 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
10164 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7]
10165 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
10166 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7]
10167 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm10
10168 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
10169 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
10170 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
10171 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm5
10172 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
10173 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10174 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0
10175 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10176 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10177 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10178 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7]
10179 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10180 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
10181 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10182 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10183 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
10184 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1]
10185 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
10186 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10187 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10188 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
10189 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
10190 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10191 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
10192 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10193 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
10194 ; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10195 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
10196 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1]
10197 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
10198 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10199 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10200 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
10201 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10202 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10203 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
10204 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
10205 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10206 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
10207 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10208 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10209 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10210 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
10211 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
10212 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
10213 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10214 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5
10215 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
10216 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10217 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
10218 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
10219 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
10220 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10221 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10222 ; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
10223 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
10224 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
10225 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4
10226 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10227 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10228 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
10229 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10230 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7]
10231 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10232 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
10233 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10234 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10235 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
10236 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
10237 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4
10238 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
10239 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10240 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
10241 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10242 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
10243 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10244 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10245 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7]
10246 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
10247 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4
10248 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
10249 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10250 ; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
10251 ; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
10252 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10253 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
10254 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10255 ; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
10256 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7]
10257 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
10258 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10259 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10260 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10261 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0
10262 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10263 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0
10264 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2
10265 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
10266 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10
10267 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm12
10268 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7]
10269 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
10270 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
10271 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
10272 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
10273 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10274 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
10275 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10276 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2]
10277 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5]
10278 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10279 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
10280 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
10281 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7]
10282 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10283 ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13
10284 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11
10285 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
10286 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
10287 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
10288 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
10289 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
10290 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
10291 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10292 ; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1
10293 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10294 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2]
10295 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5]
10296 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10297 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
10298 ; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
10299 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
10300 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10301 ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5
10302 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6
10303 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
10304 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10305 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10306 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7]
10307 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
10308 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
10309 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
10310 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
10311 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
10312 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14
10313 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2]
10314 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5]
10315 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10316 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7]
10317 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
10318 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7]
10319 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10320 ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm7
10321 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1
10322 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7]
10323 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9
10324 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7]
10325 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
10326 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10327 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
10328 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
10329 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10330 ; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm2
10331 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2]
10332 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5]
10333 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10334 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
10335 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10336 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
10337 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10338 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7]
10339 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15
10340 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7]
10341 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
10342 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10343 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10344 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7]
10345 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10346 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
10347 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10348 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
10349 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10350 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10351 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
10352 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4
10353 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
10354 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10355 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10356 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7]
10357 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10358 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
10359 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10360 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
10361 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10362 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10363 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7]
10364 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
10365 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
10366 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10367 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10368 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
10369 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10370 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10371 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10372 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10373 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10374 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10375 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
10376 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10377 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
10378 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10379 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10380 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7]
10381 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10382 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10383 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10384 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10385 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10386 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10387 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
10388 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10389 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
10390 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
10391 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
10392 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10393 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10394 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2]
10395 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
10396 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10397 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
10398 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10399 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
10400 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
10401 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10402 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm6
10403 ; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm4
10404 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
10405 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10406 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
10407 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
10408 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10409 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10410 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2]
10411 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
10412 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10413 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
10414 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10415 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
10416 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
10417 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10418 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7]
10419 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5
10420 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10421 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
10422 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10423 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
10424 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
10425 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10426 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2]
10427 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
10428 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10429 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
10430 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10431 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
10432 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
10433 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10434 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10435 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10436 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
10437 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10438 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
10439 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0
10440 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10441 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2]
10442 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
10443 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10444 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10445 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10446 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10447 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10448 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10449 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7]
10450 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10451 ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10452 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10453 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
10454 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10455 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10456 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10457 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3]
10458 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10459 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
10460 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10461 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10462 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10463 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10464 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10465 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10466 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
10467 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8
10468 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10469 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10470 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10471 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
10472 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10473 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10474 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10475 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3]
10476 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4]
10477 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10478 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10479 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10480 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10481 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10482 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10483 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
10484 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10485 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
10486 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10487 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10488 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10489 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3]
10490 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
10491 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10492 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
10493 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10494 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
10495 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10496 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10497 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7]
10498 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
10499 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
10500 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10501 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10502 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
10503 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3]
10504 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
10505 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10506 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
10507 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10508 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
10509 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
10510 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10511 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10512 ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10513 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
10514 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
10515 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
10516 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
10517 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
10518 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
10519 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10520 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
10521 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
10522 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
10523 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm5
10524 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
10525 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15]
10526 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
10527 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
10528 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
10529 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7]
10530 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
10531 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
10532 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10533 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5]
10534 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10535 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
10536 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
10537 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10538 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10539 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
10540 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10541 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
10542 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10543 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
10544 ; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10545 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5]
10546 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10547 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
10548 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10549 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
10550 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
10551 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10552 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
10553 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
10554 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
10555 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
10556 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10557 ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10558 ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7]
10559 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm7
10560 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
10561 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
10562 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
10563 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
10564 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
10565 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10566 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10567 ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
10568 ; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
10569 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10570 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
10571 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
10572 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
10573 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
10574 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10575 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10576 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
10577 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6
10578 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
10579 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
10580 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
10581 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10582 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8
10583 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
10584 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
10585 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
10586 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
10587 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
10588 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5]
10589 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10590 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
10591 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
10592 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
10593 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10594 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10595 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10596 ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10597 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4
10598 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
10599 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
10600 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
10601 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
10602 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10603 ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
10604 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7]
10605 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10606 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
10607 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15]
10608 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
10609 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10610 ; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload
10611 ; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7]
10612 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10613 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
10614 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10615 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
10616 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
10617 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10618 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
10619 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
10620 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10621 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10622 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
10623 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
10624 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
10625 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10626 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10627 ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10628 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
10629 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
10630 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
10631 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10632 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10633 ; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm13
10634 ; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm14
10635 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7]
10636 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
10637 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
10638 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10639 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
10640 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10641 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10642 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
10643 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
10644 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
10645 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
10646 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
10647 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
10648 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
10649 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
10650 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
10651 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
10652 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
10653 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10654 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10655 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
10656 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
10657 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
10658 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
10659 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4
10660 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7]
10661 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10662 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10663 ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm4
10664 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10665 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5
10666 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10667 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7]
10668 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
10669 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6]
10670 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
10671 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15]
10672 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
10673 ; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
10674 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10675 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
10676 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
10677 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
10678 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
10679 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
10680 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
10681 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
10682 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
10683 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10684 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10685 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10686 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
10687 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
10688 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
10689 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
10690 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
10691 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10692 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
10693 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
10694 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
10695 ; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1
10696 ; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0
10697 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
10698 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
10699 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
10700 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10701 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
10702 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10703 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
10704 ; AVX2-SLOW-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
10705 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm15
10706 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
10707 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
10708 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
10709 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
10710 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
10711 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10712 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
10713 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
10714 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
10715 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10716 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10717 ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
10718 ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
10719 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10720 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7]
10721 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3
10722 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
10723 ; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm15
10724 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3
10725 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7]
10726 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
10727 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
10728 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10729 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
10730 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10731 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10732 ; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
10733 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
10734 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15]
10735 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2
10736 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10737 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10738 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
10739 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
10740 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
10741 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
10742 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
10743 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
10744 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
10745 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15]
10746 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
10747 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
10748 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10749 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10750 ; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10751 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7]
10752 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
10753 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15]
10754 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10755 ; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10756 ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
10757 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5
10758 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7]
10759 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
10760 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
10761 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
10762 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10763 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
10764 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
10765 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10766 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
10767 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10768 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10769 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7]
10770 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7]
10771 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
10772 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10773 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
10774 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
10775 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
10776 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6
10777 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15]
10778 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
10779 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
10780 ; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
10781 ; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7]
10782 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
10783 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15]
10784 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload
10785 ; AVX2-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7]
10786 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
10787 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7]
10788 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
10789 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1
10790 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10791 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
10792 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
10793 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10794 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
10795 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
10796 ; AVX2-SLOW-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7]
10797 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7]
10798 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
10799 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
10800 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
10801 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
10802 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7
10803 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
10804 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
10805 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10806 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10807 ; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10808 ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7]
10809 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
10810 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15]
10811 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10812 ; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
10813 ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
10814 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9
10815 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7]
10816 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10817 ; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10818 ; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
10819 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7
10820 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10821 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
10822 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
10823 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
10824 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15]
10825 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10826 ; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
10827 ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
10828 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7]
10829 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
10830 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
10831 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
10832 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
10833 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
10834 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
10835 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
10836 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
10837 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10838 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
10839 ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
10840 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7
10841 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7]
10842 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
10843 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
10844 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
10845 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
10846 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
10847 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
10848 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15]
10849 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10850 ; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
10851 ; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7]
10852 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
10853 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
10854 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10855 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
10856 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
10857 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
10858 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
10859 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
10860 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
10861 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
10862 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
10863 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
10864 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10865 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi)
10866 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10867 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi)
10868 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10869 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi)
10870 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10871 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi)
10872 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10873 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx)
10874 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10875 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx)
10876 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10877 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx)
10878 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10879 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx)
10880 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10881 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx)
10882 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10883 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx)
10884 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10885 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx)
10886 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10887 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx)
10888 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10889 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r8)
10890 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10891 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8)
10892 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10893 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r8)
10894 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10895 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8)
10896 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10897 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9)
10898 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10899 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r9)
10900 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10901 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9)
10902 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10903 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9)
10904 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
10905 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10906 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rax)
10907 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10908 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax)
10909 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10910 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax)
10911 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10912 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax)
10913 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
10914 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rax)
10915 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax)
10916 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax)
10917 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax)
10918 ; AVX2-SLOW-NEXT: addq $1448, %rsp # imm = 0x5A8
10919 ; AVX2-SLOW-NEXT: vzeroupper
10920 ; AVX2-SLOW-NEXT: retq
10921 ;
10922 ; AVX2-FAST-LABEL: load_i16_stride7_vf64:
10923 ; AVX2-FAST: # %bb.0:
10924 ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608
10925 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6
10926 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7
10927 ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm14
10928 ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm15
10929 ; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10930 ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm12
10931 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10932 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13
10933 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10934 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2
10935 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9
10936 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
10937 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0
10938 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10939 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
10940 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11
10941 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
10942 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
10943 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
10944 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7]
10945 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
10946 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
10947 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
10948 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
10949 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2
10950 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
10951 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0
10952 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10953 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
10954 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
10955 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
10956 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
10957 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
10958 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10959 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
10960 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
10961 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0
10962 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10963 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
10964 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15
10965 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10966 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm13
10967 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10968 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
10969 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
10970 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3
10971 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1
10972 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12
10973 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7]
10974 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10975 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10976 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
10977 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6
10978 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0
10979 ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
10980 ; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2
10981 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10982 ; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm0
10983 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10984 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
10985 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
10986 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
10987 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5
10988 ; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2
10989 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm0
10990 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10991 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
10992 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3
10993 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10994 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
10995 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4
10996 ; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm0
10997 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2
10998 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10999 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm2
11000 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11001 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11002 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
11003 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11004 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
11005 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11006 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11007 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7]
11008 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,5,1,u,4,u,u,u]
11009 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6
11010 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
11011 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8
11012 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
11013 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11014 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
11015 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11016 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11017 ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
11018 ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7]
11019 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11020 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
11021 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11022 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
11023 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
11024 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11025 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11026 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
11027 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14
11028 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11029 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7]
11030 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11031 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
11032 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11033 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7]
11034 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11035 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11036 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
11037 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11038 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11039 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11040 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7]
11041 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11042 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
11043 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6
11044 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11045 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
11046 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4
11047 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
11048 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3
11049 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11050 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7]
11051 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11052 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
11053 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
11054 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,1,u,5,u,u,u]
11055 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6
11056 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
11057 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8
11058 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
11059 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11060 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2
11061 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11062 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11063 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11064 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7]
11065 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11066 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
11067 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11068 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11069 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11070 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
11071 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11072 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11073 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1
11074 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11075 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm1
11076 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7]
11077 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11078 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
11079 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11080 ; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm11
11081 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11082 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
11083 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11084 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11085 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7
11086 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11087 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11088 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11089 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7]
11090 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11091 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
11092 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6
11093 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11094 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11095 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
11096 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4
11097 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
11098 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
11099 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
11100 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11101 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11102 ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11103 ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
11104 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11105 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
11106 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11107 ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11108 ; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7]
11109 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3]
11110 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
11111 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm7
11112 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
11113 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6
11114 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
11115 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11116 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7]
11117 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
11118 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
11119 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6
11120 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
11121 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
11122 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7
11123 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3
11124 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11125 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7]
11126 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
11127 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
11128 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6
11129 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
11130 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
11131 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7
11132 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2
11133 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11134 ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
11135 ; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7]
11136 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
11137 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
11138 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5
11139 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7]
11140 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3]
11141 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4
11142 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0
11143 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11144 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10
11145 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1
11146 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7]
11147 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11
11148 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5]
11149 ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
11150 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4
11151 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31]
11152 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6
11153 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0
11154 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11155 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
11156 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
11157 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm7
11158 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14
11159 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
11160 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
11161 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7]
11162 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11163 ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12
11164 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm7
11165 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7]
11166 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0
11167 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm8
11168 ; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm0
11169 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11170 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
11171 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2
11172 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7]
11173 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
11174 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7]
11175 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11176 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0
11177 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6
11178 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
11179 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15
11180 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11181 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2
11182 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm8
11183 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0
11184 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11185 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
11186 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm9
11187 ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13
11188 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
11189 ; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload
11190 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7]
11191 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11192 ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm0
11193 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm8
11194 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7]
11195 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9
11196 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm5
11197 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5
11198 ; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm0
11199 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11200 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
11201 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14
11202 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7]
11203 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
11204 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7]
11205 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11206 ; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13
11207 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7]
11208 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14
11209 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7]
11210 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
11211 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5
11212 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4
; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14
; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm5
; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm3
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm3
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7]
; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14
; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,3]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3]
; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3]
; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,3,7,2,6,u,u,u]
; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7]
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10
; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7]
; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7]
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4]
; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,7,3,6,u,u,u]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13
; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7]
; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7]
; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12
; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm13
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm13
; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5
; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10
; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,0,3,7,u,u,u]
; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm7
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7
; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm9
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7
; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8
; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8
; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2
; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9)
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax)
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax)
; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax)
; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax)
; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax)
; AVX2-FAST-NEXT: addq $1544, %rsp # imm = 0x608
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf64:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: subq $1448, %rsp # imm = 0x5A8
; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm13
; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm11
; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm14
; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm10
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8
; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm13
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm10
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm14
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,1,0,2]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12037 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12038 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
12039 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5
12040 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
12041 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12042 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
12043 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
12044 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12045 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7]
12046 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5
12047 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
12048 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12049 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12050 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
12051 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12052 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12053 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12054 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12055 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12056 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7]
12057 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12058 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
12059 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12060 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12061 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1
12062 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12063 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12064 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12065 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12066 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12067 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12068 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
12069 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12070 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
12071 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12072 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm1
12073 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12074 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12075 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12076 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12077 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12078 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12079 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
12080 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6
12081 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12082 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
12083 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
12084 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12085 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12086 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12087 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,2]
12088 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm5 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
12089 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
12090 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12091 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12092 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12093 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12094 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12095 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
12096 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
12097 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
12098 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12099 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12100 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12101 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2]
12102 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
12103 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12104 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12105 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12106 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12107 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12108 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12109 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12110 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
12111 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
12112 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
12113 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12114 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12115 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2]
12116 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
12117 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12118 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12119 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12120 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12121 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12122 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12123 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12124 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7]
12125 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
12126 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
12127 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12128 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12129 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2]
12130 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1
12131 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12132 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12133 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12134 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12135 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12136 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12137 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7]
12138 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14
12139 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12140 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12141 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
12142 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
12143 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
12144 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12145 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,3]
12146 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12147 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
12148 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5
12149 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
12150 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12151 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
12152 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
12153 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12154 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7]
12155 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12156 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11
12157 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12158 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5
12159 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
12160 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
12161 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12162 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3]
12163 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm6
12164 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
12165 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12166 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
12167 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
12168 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12169 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7]
12170 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6
12171 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7]
12172 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
12173 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
12174 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,3]
12175 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm7
12176 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
12177 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12178 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
12179 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
12180 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12181 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
12182 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
12183 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
12184 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0
12185 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
12186 ; AVX2-FAST-PERLANE-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
12187 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,1,3]
12188 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2
12189 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
12190 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12191 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15]
12192 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
12193 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12194 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12195 ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12196 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
12197 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
12198 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4
12199 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12
12200 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
12201 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
12202 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
12203 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12204 ; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
12205 ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7]
12206 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
12207 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7
12208 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
12209 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
12210 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
12211 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7]
12212 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8
12213 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
12214 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
12215 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8
12216 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12217 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
12218 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm5
12219 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7]
12220 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15]
12221 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
12222 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12223 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12224 ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12225 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
12226 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6
12227 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5
12228 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
12229 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12230 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
12231 ; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
12232 ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7]
12233 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7
12234 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
12235 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
12236 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
12237 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7]
12238 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8
12239 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
12240 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6
12241 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
12242 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12243 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm3
12244 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm8
12245 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
12246 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15]
12247 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
12248 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12249 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12250 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12251 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
12252 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm5
12253 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm14
12254 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3
12255 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
12256 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
12257 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12258 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12259 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
12260 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm6
12261 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
12262 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
12263 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
12264 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12265 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12266 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
12267 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7
12268 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
12269 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5
12270 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12271 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1
12272 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
12273 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
12274 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
12275 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12276 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12277 ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12278 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
12279 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm3
12280 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
12281 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
12282 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
12283 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12284 ; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
12285 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7]
12286 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12287 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
12288 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
12289 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
12290 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12291 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12292 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
12293 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5
12294 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
12295 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3
12296 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0
12297 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12298 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
12299 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12300 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12301 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12302 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
12303 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
12304 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12305 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12306 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12307 ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12308 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
12309 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
12310 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
12311 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
12312 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
12313 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12314 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm3
12315 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12316 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm2
12317 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12318 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7]
12319 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
12320 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2
12321 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12322 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
12323 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12324 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12325 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
12326 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
12327 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
12328 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8
12329 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
12330 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2
12331 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
12332 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
12333 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12334 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
12335 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
12336 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12337 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12338 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
12339 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12340 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12341 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
12342 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8
12343 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7]
12344 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
12345 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12346 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm9
12347 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12348 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm8
12349 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12350 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7]
12351 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8
12352 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12353 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
12354 ; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
12355 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
12356 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
12357 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm9
12358 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11
12359 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8
12360 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
12361 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12362 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
12363 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
12364 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12365 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12366 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12367 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
12368 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12369 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12370 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
12371 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12372 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
12373 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
12374 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11
12375 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm1
12376 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm0
12377 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
12378 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15
12379 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
12380 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15]
12381 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12382 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
12383 ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7]
12384 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13
12385 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm13
12386 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15
12387 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
12388 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm8
12389 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15]
12390 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3]
12391 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm11[4,5,6,7]
12392 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12393 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12394 ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
12395 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7]
12396 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
12397 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7]
12398 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8
12399 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm14
12400 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm5
12401 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7]
12402 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3
12403 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12404 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12405 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15]
12406 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
12407 ; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
12408 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
12409 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
12410 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7,8],ymm8[9,10,11,12,13,14],ymm13[15]
12411 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2
12412 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12413 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12414 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
12415 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm13
12416 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm10
12417 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4
12418 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
12419 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
12420 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
12421 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12422 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12423 ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12424 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7]
12425 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
12426 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
12427 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12428 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12429 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7]
12430 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
12431 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7]
12432 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12433 ; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
12434 ; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
12435 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
12436 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3
12437 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12438 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15]
12439 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7
12440 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
12441 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
12442 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12443 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
12444 ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7]
12445 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9
12446 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
12447 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9
12448 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
12449 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
12450 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
12451 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2
12452 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
12453 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
12454 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
12455 ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
12456 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6,7]
12457 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
12458 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15]
12459 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12460 ; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
12461 ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
12462 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
12463 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7]
12464 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
12465 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1
12466 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12467 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
12468 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12469 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
12470 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12471 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12472 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7]
12473 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9
12474 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9
12475 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
12476 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
12477 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8
12478 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
12479 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
12480 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12481 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12482 ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12483 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7]
12484 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
12485 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15]
12486 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12487 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
12488 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
12489 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
12490 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
12491 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12492 ; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
12493 ; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7]
12494 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8
12495 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12496 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9
12497 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
12498 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
12499 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12500 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
12501 ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7]
12502 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
12503 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm12
12504 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7]
12505 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
12506 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1
12507 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm8[1,2,3,4,5,6,7],ymm1[8],ymm8[9,10,11,12,13,14,15]
12508 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
12509 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
12510 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12511 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
12512 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
12513 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
12514 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
12515 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4
12516 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7]
12517 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5
12518 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
12519 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12520 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
12521 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
12522 ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12523 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7]
12524 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
12525 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15]
12526 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm5
12527 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
12528 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
12529 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm3
12530 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7]
12531 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
12532 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
12533 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
12534 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
12535 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12536 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi)
12537 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12538 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi)
12539 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12540 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi)
12541 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12542 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi)
12543 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12544 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx)
12545 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12546 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx)
12547 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12548 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx)
12549 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12550 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx)
12551 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12552 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx)
12553 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12554 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx)
12555 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12556 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx)
12557 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12558 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx)
12559 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12560 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8)
12561 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12562 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8)
12563 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12564 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8)
12565 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12566 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8)
12567 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12568 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9)
12569 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12570 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9)
12571 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12572 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9)
12573 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12574 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9)
12575 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
12576 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12577 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax)
12578 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12579 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax)
12580 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12581 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax)
12582 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax)
12583 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
12584 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rax)
12585 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax)
12586 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax)
12587 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax)
12588 ; AVX2-FAST-PERLANE-NEXT: addq $1448, %rsp # imm = 0x5A8
12589 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
12590 ; AVX2-FAST-PERLANE-NEXT: retq
;
12592 ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf64:
12593 ; AVX512F-ONLY-SLOW: # %bb.0:
12594 ; AVX512F-ONLY-SLOW-NEXT: subq $1864, %rsp # imm = 0x748
12595 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1
12596 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2
12597 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
12598 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16
12599 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18
12600 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
12601 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
12602 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
12603 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
12604 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3
12605 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4
12606 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
12607 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm10
12608 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm13
12609 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
12610 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
12611 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3
12612 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2
12613 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12614 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2
12615 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4
12616 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
12617 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22
12618 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
12619 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
12620 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12621 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4
12622 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
12623 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6
12624 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm15
12625 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
12626 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm12
12627 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
12628 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1
12629 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
12630 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm6
12631 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm8
12632 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
12633 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
12634 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
12635 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
12636 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12637 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
12638 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11
12639 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm21
12640 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm21[0,1,0,2]
12641 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0
12642 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4
12643 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
12644 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
12645 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
12646 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12647 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0
12648 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
12649 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm14
12650 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
12651 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
12652 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12653 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
12654 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17
12655 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm19
12656 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12657 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
12658 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12659 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5
12660 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0
12661 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12662 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7]
12663 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20
12664 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
12665 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
12666 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
12667 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
12668 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12669 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
12670 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
12671 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
12672 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0
12673 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm4[1],xmm0[2,3,4,5,6,7]
12674 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23
12675 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
12676 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
12677 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0
12678 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12679 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7
12680 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm12
12681 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
12682 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
12683 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3
12684 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10
12685 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm11
12686 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
12687 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
12688 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
12689 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6
12690 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3
12691 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12692 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0
12693 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm6
12694 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7]
12695 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26
12696 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30
12697 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
12698 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
12699 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
12700 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16
12701 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
12702 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12703 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
12704 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
12705 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
12706 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm3
12707 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2
12708 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
12709 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
12710 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
12711 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
12712 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12713 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
12714 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12715 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12716 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm22
12717 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12718 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
12719 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14
12720 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12721 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1
12722 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12723 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
12724 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
12725 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
12726 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
12727 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
12728 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12729 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6
12730 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6
12731 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12732 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0
12733 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
12734 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19
12735 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8
12736 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
12737 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
12738 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
12739 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12740 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm21[0,1,1,2]
12741 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
12742 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
12743 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
12744 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0
12745 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
12746 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12747 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3]
12748 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
12749 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
12750 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12751 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
12752 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12753 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12754 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
12755 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
12756 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7
12757 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
12758 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12759 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12760 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
12761 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
12762 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5
12763 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5
12764 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12765 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9
12766 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0
12767 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
12768 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
12769 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
12770 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5
12771 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12772 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,1,2]
12773 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
12774 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
12775 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
12776 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
12777 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12778 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28
12779 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
12780 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
12781 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
12782 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12783 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
12784 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12785 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
12786 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0
12787 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7]
12788 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
12789 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
12790 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8
12791 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12792 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7
12793 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm1
12794 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12795 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
12796 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
12797 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
12798 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6
12799 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
12800 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
12801 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5
12802 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0
12803 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12804 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15
12805 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm10
12806 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
12807 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12808 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
12809 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3]
12810 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
12811 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5
12812 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12813 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5]
12814 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12815 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
12816 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6
12817 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm23, %xmm7
12818 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
12819 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
12820 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12821 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm12
12822 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm13
12823 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
12824 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12825 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
12826 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1
12827 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
12828 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12829 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5]
12830 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12831 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
12832 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm5
12833 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm6
12834 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
12835 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0
12836 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12837 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7]
12838 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
12839 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7]
12840 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
12841 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5
12842 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12843 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6
12844 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7
12845 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7]
12846 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
12847 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
12848 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12849 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
12850 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm4, %xmm2
12851 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm4, %xmm8
12852 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4
12853 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
12854 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0
12855 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12856 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7]
12857 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8
12858 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7]
12859 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1
12860 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
12861 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1
12862 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm5
12863 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7]
12864 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
12865 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6]
12866 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
12867 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
12868 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11
12869 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm28, %xmm9
12870 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
12871 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
12872 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12873 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
12874 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
12875 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
12876 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
12877 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
12878 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
12879 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12880 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
12881 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
12882 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
12883 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
12884 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
12885 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
12886 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
12887 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0
12888 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12889 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
12890 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4
12891 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12892 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
12893 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
12894 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
12895 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12896 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5]
12897 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12898 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
12899 ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
12900 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
12901 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
12902 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
12903 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12904 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
12905 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm17
12906 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
12907 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
12908 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
12909 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7]
12910 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12911 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
12912 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
12913 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12914 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
12915 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
12916 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
12917 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
12918 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
12919 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12920 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1
12921 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2
12922 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
12923 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5
12924 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm7
12925 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
12926 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
12927 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3
12928 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2
12929 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
12930 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20
12931 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm4
12932 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
12933 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
12934 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
12935 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
12936 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
12937 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
12938 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
12939 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
12940 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3
12941 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm8
12942 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
12943 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21
12944 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
12945 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
12946 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
12947 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
12948 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
12949 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12950 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
12951 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12952 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
12953 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm22
12954 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm23
12955 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
12956 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12957 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
12958 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
12959 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
12960 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12961 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,1,4,5,6,5]
12962 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12963 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12964 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
12965 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
12966 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12967 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2
12968 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10
12969 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
12970 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm15
12971 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25
12972 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
12973 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
12974 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14
12975 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10
12976 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7]
12977 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
12978 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15]
12979 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
12980 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,1]
12981 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
12982 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7]
12983 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12984 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1
12985 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3
12986 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
12987 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12
12988 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
12989 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
12990 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
12991 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
12992 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
12993 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
12994 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12995 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2
12996 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm2[3],ymm15[4,5],ymm2[6],ymm15[7]
12997 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm24
12998 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12
12999 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6],xmm12[7]
13000 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7]
13001 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
13002 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
13003 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
13004 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0
13005 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
13006 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm12
13007 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7]
13008 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
13009 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm15
13010 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
13011 ; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm25
13012 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
13013 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
13014 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
13015 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
13016 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
13017 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
13018 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
13019 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13020 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7]
13021 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
13022 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7]
13023 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0
13024 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6
13025 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
13026 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
13027 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
13028 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm1
13029 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7]
13030 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13031 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9
13032 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
13033 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11
13034 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
13035 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
13036 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13037 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
13038 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
13039 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13040 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13041 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13042 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
13043 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21
13044 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
13045 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm11
13046 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13047 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
13048 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm14
13049 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
13050 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
13051 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
13052 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
13053 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11
13054 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
13055 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm11
13056 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7]
13057 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm15, %ymm13
13058 ; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm25
13059 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
13060 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1]
13061 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
13062 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
13063 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13064 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
13065 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13066 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
13067 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1
13068 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13069 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
13070 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
13071 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11
13072 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
13073 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1
13074 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm1
13075 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
13076 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
13077 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
13078 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
13079 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7]
13080 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
13081 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13082 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13083 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13084 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0
13085 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1
13086 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
13087 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
13088 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
13089 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
13090 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
13091 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13092 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
13093 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
13094 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
13095 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13096 ; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
13097 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
13098 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
13099 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13100 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm15
13101 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7]
13102 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
13103 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
13104 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2],ymm13[3,4,5],ymm3[6],ymm13[7]
13105 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19
13106 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26
13107 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11
13108 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
13109 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
13110 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0
13111 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm2
13112 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10
13113 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7]
13114 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0]
13115 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
13116 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm12
13117 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0
13118 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
13119 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
13120 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13121 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
13122 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13123 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13124 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0
13125 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1
13126 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
13127 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
13128 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
13129 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
13130 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
13131 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13132 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,0,4,5,6,4]
13133 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
13134 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
13135 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13136 ; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
13137 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
13138 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm25
13139 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
13140 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24
13141 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm23
13142 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
13143 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
13144 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0
13145 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
13146 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21
13147 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22
13148 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0]
13149 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1
13150 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
13151 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm5
13152 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm4
13153 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
13154 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
13155 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
13156 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
13157 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
13158 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
13159 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
13160 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13161 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13162 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13163 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
13164 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0
13165 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
13166 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31
13167 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
13168 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm11
13169 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13170 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15]
13171 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13172 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
13173 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1
13174 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
13175 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm28
13176 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
13177 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
13178 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
13179 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
13180 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
13181 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7]
13182 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13183 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7
13184 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8
13185 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
13186 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
13187 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
13188 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm9
13189 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
13190 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17
13191 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,0,1]
13192 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15]
13193 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7]
13194 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm26
13195 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27
13196 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15
13197 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
13198 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
13199 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
13200 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13201 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
13202 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11
13203 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
13204 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
13205 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm13
13206 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm11, %ymm13, %ymm11
13207 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
13208 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13209 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13210 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
13211 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm10[2],ymm15[3,4,5],ymm10[6],ymm15[7]
13212 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29
13213 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm3
13214 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13215 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7,8,9,10,11,12,13],ymm0[14],ymm3[15]
13216 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13217 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13218 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
13219 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30
13220 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11
13221 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
13222 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
13223 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
13224 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
13225 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
13226 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
13227 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13228 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm3
13229 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13230 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
13231 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm14
13232 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18
13233 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
13234 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
13235 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
13236 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm12
13237 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4
13238 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13239 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
13240 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm21[0,1,0,1]
13241 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3],ymm3[4,5,6,7,8,9,10],ymm11[11],ymm3[12,13,14,15]
13242 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1
13243 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm6
13244 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5
13245 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
13246 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11
13247 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3,4,5],xmm11[6],xmm3[7]
13248 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2
13249 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13250 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
13251 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1
13252 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13253 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
13254 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0
13255 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
13256 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm24
13257 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm23
13258 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13259 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
13260 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
13261 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17
13262 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm21
13263 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
13264 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
13265 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2
13266 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3
13267 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
13268 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
13269 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
13270 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
13271 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
13272 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
13273 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm1
13274 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13275 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
13276 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
13277 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2
13278 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22
13279 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0
13280 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13281 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13282 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10
13283 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8
13284 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
13285 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13286 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
13287 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm9
13288 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm7
13289 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
13290 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
13291 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
13292 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
13293 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13294 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
13295 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13296 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
13297 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
13298 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13299 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13300 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7]
13301 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20
13302 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13303 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
13304 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
13305 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm12
13306 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7]
13307 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
13308 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
13309 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm1
13310 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
13311 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28
13312 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31
13313 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11
13314 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3],xmm11[4,5,6,7]
13315 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm4
13316 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
13317 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13318 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
13319 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0
13320 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13321 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18
13322 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1
13323 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7]
13324 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm16
13325 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13326 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
13327 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
13328 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2
13329 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
13330 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm29
13331 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm13
13332 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
13333 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
13334 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
13335 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13336 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
13337 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13338 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13339 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
13340 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm30
13341 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7]
13342 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13343 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
13344 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
13345 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
13346 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
13347 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
13348 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
13349 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
13350 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
13351 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0
13352 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13353 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0
13354 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1
13355 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
13356 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
13357 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
13358 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1
13359 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2
13360 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
13361 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
13362 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
13363 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13364 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
13365 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13366 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
13367 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
13368 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
13369 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
13370 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5
13371 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7
13372 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
13373 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,3,1]
13374 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
13375 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7
13376 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4,5,6,7]
13377 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
13378 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
13379 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13380 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
13381 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
13382 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6
13383 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
13384 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
13385 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6,7,8],ymm8[9],ymm6[10,11,12,13,14,15]
13386 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm6
13387 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm8
13388 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7]
13389 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
13390 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
13391 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
13392 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
13393 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
13394 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7]
13395 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
13396 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm19, %zmm13
13397 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
13398 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8
13399 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
13400 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6
13401 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm7
13402 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8
13403 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
13404 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
13405 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
13406 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
13407 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7]
13408 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
13409 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm8
13410 ; AVX512F-ONLY-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
13411 ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7]
13412 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
13413 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
13414 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload
13415 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
13416 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
13417 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
13418 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload
13419 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload
13420 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm2
13421 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm4
13422 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1]
13423 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5
13424 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7]
13425 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
13426 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
13427 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
13428 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
13429 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload
13430 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
13431 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
13432 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
13433 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm14 # 32-byte Folded Reload
13434 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
13435 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm23
13436 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
13437 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
13438 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
13439 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm12 # 64-byte Folded Reload
13440 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
13441 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
13442 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1}
13443 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
13444 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
13445 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1}
13446 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7
13447 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
13448 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
13449 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1}
13450 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8
13451 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm25 # 64-byte Folded Reload
13452 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1}
13453 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
13454 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
13455 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1}
13456 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rsi)
13457 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rsi)
13458 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rdx)
13459 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx)
13460 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx)
13461 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx)
13462 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r8)
13463 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r8)
13464 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9)
13465 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
13466 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1}
13467 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13468 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload
13469 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1}
13470 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r9)
13471 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
13472 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
13473 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm11 # 64-byte Folded Reload
13474 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1
13475 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1}
13476 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
13477 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax)
13478 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
13479 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
13480 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0
13481 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1}
13482 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax)
13483 ; AVX512F-ONLY-SLOW-NEXT: addq $1864, %rsp # imm = 0x748
13484 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper
13485 ; AVX512F-ONLY-SLOW-NEXT: retq
13486 ;
13487 ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf64:
13488 ; AVX512F-ONLY-FAST: # %bb.0:
13489 ; AVX512F-ONLY-FAST-NEXT: subq $1800, %rsp # imm = 0x708
13490 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5
13491 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
13492 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,5,9,u,12,u,u,u]
13493 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm0
13494 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
13495 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm2
13496 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29
13497 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm3, %zmm4
13498 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm20
13499 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm5
13500 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm6
13501 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
13502 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21
13503 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm12
13504 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5
13505 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
13506 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
13507 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6
13508 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
13509 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
13510 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm17
13511 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm7
13512 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
13513 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6
13514 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm27
13515 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7
13516 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm30 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
13517 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm28
13518 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm28[0,1,0,2]
13519 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13520 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
13521 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6
13522 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
13523 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
13524 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8
13525 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
13526 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
13527 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3
13528 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm16
13529 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm14
13530 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm14, %xmm2
13531 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3
13532 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
13533 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13534 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15
13535 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
13536 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
13537 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
13538 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm18
13539 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm19
13540 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
13541 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
13542 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13543 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3
13544 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0
13545 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13546 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
13547 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3
13548 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7]
13549 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6
13550 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
13551 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
13552 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6
13553 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
13554 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
13555 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9
13556 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
13557 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7]
13558 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm5
13559 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
13560 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10
13561 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
13562 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13563 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm13
13564 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm15
13565 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
13566 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
13567 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
13568 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8
13569 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm1, %zmm1
13570 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm22
13571 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
13572 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1
13573 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13574 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0
13575 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1
13576 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
13577 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm23
13578 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26
13579 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
13580 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7]
13581 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm25
13582 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8
13583 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,0,2]
13584 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
13585 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm6
13586 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
13587 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7]
13588 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm4
13589 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm2
13590 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13591 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1],xmm4[2,3,4,5,6,7]
13592 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8
13593 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,6,9,u,13,u,u,u]
13594 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm0
13595 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13596 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1
13597 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13598 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0
13599 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13600 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
13601 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
13602 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
13603 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31
13604 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13605 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm7, %zmm9
13606 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
13607 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9
13608 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13609 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8
13610 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8
13611 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13612 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7]
13613 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm29
13614 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27
13615 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
13616 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
13617 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
13618 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8
13619 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
13620 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5]
13621 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm11, %ymm12
13622 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13623 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7]
13624 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
13625 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm20
13626 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm14, %xmm18
13627 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
13628 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm14
13629 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm3
13630 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13631 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7]
13632 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13633 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm3
13634 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13635 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
13636 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
13637 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10
13638 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm7, %zmm7
13639 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13640 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6
13641 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6
13642 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13643 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm6
13644 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm7
13645 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
13646 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
13647 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7]
13648 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6
13649 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
13650 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm11, %ymm7
13651 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13652 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
13653 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
13654 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm19
13655 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm9
13656 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm2
13657 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13658 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
13659 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9
13660 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7]
13661 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
13662 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm1, %zmm10
13663 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
13664 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10
13665 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13666 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9
13667 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm9, %ymm0
13668 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13669 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4
13670 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm5
13671 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
13672 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
13673 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7]
13674 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
13675 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9
13676 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14
13677 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
13678 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm28[0,1,1,3]
13679 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm13
13680 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
13681 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
13682 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8
13683 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm0
13684 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13685 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7]
13686 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm13
13687 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5,6,7]
13688 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8
13689 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm1, %zmm12
13690 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6
13691 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm8, %ymm0
13692 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13693 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1
13694 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm2
13695 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7]
13696 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
13697 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
13698 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6
13699 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
13700 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,1,3]
13701 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm10
13702 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
13703 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7
13704 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0
13705 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13706 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
13707 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
13708 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7]
13709 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
13710 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3
13711 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13712 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
13713 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm6
13714 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
13715 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6
13716 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm5
13717 ; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm20, %xmm9
13718 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
13719 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm27
13720 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3
13721 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm4
13722 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm21
13723 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [3,6,10,13,3,6,10,13]
13724 ; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
13725 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm21, %zmm24, %zmm6
13726 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
13727 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6
13728 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
13729 ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0
13730 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
13731 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm30, %zmm4, %zmm17 {%k1}
13732 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13733 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
13734 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
13735 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
13736 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0
13737 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13738 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1
13739 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
13740 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1
13741 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm9
13742 ; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm19, %xmm2
13743 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13744 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26
13745 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7]
13746 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm2, %ymm0
13747 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
13748 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm6
13749 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
13750 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
13751 ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1]
13752 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm8
13753 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
13754 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm8
13755 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15]
13756 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm15
13757 ; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm18, %xmm8
13758 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
13759 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm3
13760 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13761 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3
13762 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm8
13763 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7]
13764 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm18
13765 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm20
13766 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
13767 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7]
13768 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
13769 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm8
13770 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
13771 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,u,u,u,4,7,11,14]
13772 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm6, %zmm13
13773 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
13774 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm13
13775 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6],xmm13[7]
13776 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
13777 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13778 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7
13779 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm24, %zmm13
13780 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm11
13781 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7]
13782 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 {%k1} # 16-byte Folded Reload
13783 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13784 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm2, %ymm2
13785 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2
13786 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm21, %zmm1, %zmm7
13787 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0
13788 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
13789 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13790 ; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm14, %xmm2
13791 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
13792 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
13793 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13794 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
13795 ; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1]
13796 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm12, %ymm0
13797 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
13798 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
13799 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23
13800 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
13801 ; AVX512F-ONLY-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3]
13802 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm17, %zmm7
13803 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
13804 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7
13805 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm19
13806 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
13807 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
13808 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm22 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
13809 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2
13810 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5
13811 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
13812 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13813 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0
13814 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm7
13815 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
13816 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm7
13817 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7]
13818 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13819 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2
13820 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm4
13821 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
13822 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm11
13823 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4],xmm13[5],xmm11[6,7]
13824 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm11
13825 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16
13826 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm6
13827 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6
13828 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7]
13829 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
13830 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm10
13831 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm1
13832 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1
13833 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
13834 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
13835 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
13836 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm15
13837 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm4
13838 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6
13839 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
13840 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
13841 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
13842 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [1,u,u,u,4,8,11,15]
13843 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm11, %zmm13
13844 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
13845 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm13
13846 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4,5,6],xmm13[7]
13847 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
13848 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm10, %zmm17, %zmm13
13849 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
13850 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13
13851 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
13852 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13853 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm7
13854 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm8
13855 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
13856 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13
13857 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6],xmm13[7]
13858 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
13859 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm11, %zmm6
13860 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm3
13861 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
13862 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
13863 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm17, %zmm3
13864 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2
13865 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
13866 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13867 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7]
13868 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm13
13869 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
13870 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
13871 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
13872 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
13873 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,u,u,5,8,12,15]
13874 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm3, %zmm6
13875 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
13876 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6
13877 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1
13878 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm10, %zmm24, %zmm6
13879 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6
13880 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
13881 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13882 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
13883 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10
13884 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6
13885 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
13886 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
13887 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm2
13888 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2
13889 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1
13890 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm24, %zmm0
13891 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0
13892 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13893 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13894 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm12, %ymm0
13895 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1
13896 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
13897 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm21, %zmm17, %zmm1
13898 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm2
13899 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
13900 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
13901 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
13902 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2
13903 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
13904 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
13905 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13906 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm1
13907 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm15
13908 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm1[2],ymm15[3,4,5],ymm1[6],ymm15[7]
13909 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm7
13910 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
13911 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
13912 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm11
13913 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
13914 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
13915 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
13916 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
13917 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
13918 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13919 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9]
13920 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm3, %zmm6
13921 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
13922 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6
13923 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
13924 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
13925 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1
13926 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1
13927 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13928 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13929 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 864(%rdi), %ymm1
13930 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm5
13931 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7]
13932 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm21
13933 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19
13934 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
13935 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
13936 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
13937 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm1
13938 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
13939 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
13940 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm23
13941 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
13942 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
13943 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2
13944 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13945 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
13946 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
13947 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13948 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13949 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13950 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13951 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
13952 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
13953 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2
13954 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm25
13955 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
13956 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
13957 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13958 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,7,10,14,u,u,u]
13959 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
13960 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm24, %zmm5, %zmm3
13961 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
13962 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3
13963 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
13964 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
13965 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm27
13966 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13967 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7]
13968 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm12
13969 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
13970 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
13971 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7]
13972 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm22
13973 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm17
13974 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
13975 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
13976 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
13977 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
13978 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13979 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [2,u,u,u,6,9,13,u]
13980 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm20, %zmm13
13981 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
13982 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm13
13983 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15]
13984 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
13985 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
13986 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm13, %ymm3
13987 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13988 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13989 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13990 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
13991 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
13992 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm13
13993 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm3
13994 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
13995 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
13996 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
13997 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
13998 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm5, %zmm3
13999 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3
14000 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
14001 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm26
14002 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14003 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5
14004 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm4
14005 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
14006 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
14007 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
14008 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
14009 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm20, %zmm3
14010 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0
14011 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm14
14012 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7]
14013 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm20
14014 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
14015 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
14016 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2
14017 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14018 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
14019 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0
14020 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14021 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14022 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
14023 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm23
14024 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm26
14025 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
14026 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1
14027 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm27
14028 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
14029 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
14030 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm28
14031 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14032 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,4,7,11,14,u,u,u]
14033 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm24, %zmm21, %zmm2
14034 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
14035 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2
14036 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
14037 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14038 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7]
14039 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm19
14040 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6
14041 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7]
14042 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm9
14043 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2
14044 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7]
14045 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm13
14046 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2],xmm6[3],xmm13[4,5,6,7]
14047 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
14048 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0
14049 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14050 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [3,u,u,u,6,10,13,u]
14051 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm18, %zmm1
14052 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
14053 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
14054 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
14055 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
14056 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6
14057 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm6, %ymm1
14058 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14059 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31
14060 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
14061 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm22
14062 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
14063 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm6
14064 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
14065 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm6
14066 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
14067 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14068 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload
14069 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm21, %zmm1
14070 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
14071 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
14072 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
14073 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm28
14074 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm27
14075 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8
14076 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7]
14077 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm21 # 32-byte Folded Reload
14078 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0
14079 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm18, %zmm8
14080 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3
14081 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm6
14082 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
14083 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12
14084 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2],xmm8[3],xmm12[4,5,6,7]
14085 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm4
14086 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14087 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
14088 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3
14089 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
14090 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18
14091 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0
14092 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3
14093 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
14094 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
14095 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
14096 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3
14097 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
14098 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
14099 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,8,11,15,u,u,u]
14100 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm24, %zmm3, %zmm8
14101 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
14102 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8
14103 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7]
14104 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm5
14105 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
14106 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7
14107 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
14108 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7]
14109 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
14110 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm8
14111 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
14112 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7]
14113 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
14114 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3]
14115 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm9, %zmm15
14116 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
14117 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15
14118 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm7[1,2],ymm15[3,4,5,6,7]
14119 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
14120 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5
14121 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14122 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
14123 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7]
14124 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2
14125 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7]
14126 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm13
14127 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm4
14128 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
14129 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
14130 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
14131 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm3, %zmm3
14132 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3
14133 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7]
14134 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm2
14135 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm3
14136 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
14137 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10
14138 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
14139 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3
14140 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7]
14141 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm11
14142 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
14143 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,1,3,4,5,6,7]
14144 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
14145 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload
14146 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm9, %zmm9
14147 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8
14148 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
14149 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
14150 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
14151 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
14152 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload
14153 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
14154 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload
14155 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload
14156 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm5
14157 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm11
14158 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7]
14159 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14160 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
14161 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
14162 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
14163 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm19 # 64-byte Folded Reload
14164 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
14165 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
14166 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
14167 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload
14168 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
14169 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
14170 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16
14171 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18
14172 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
14173 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
14174 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
14175 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
14176 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm19 {%k1}
14177 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
14178 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload
14179 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1}
14180 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14181 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload
14182 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm2 {%k1}
14183 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
14184 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload
14185 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1}
14186 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
14187 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm9 {%k1}
14188 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
14189 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1}
14190 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
14191 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1}
14192 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rsi)
14193 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi)
14194 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rdx)
14195 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx)
14196 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rcx)
14197 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx)
14198 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8)
14199 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r8)
14200 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r9)
14201 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9)
14202 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
14203 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax)
14204 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14205 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
14206 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1}
14207 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
14208 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
14209 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1}
14210 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
14211 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax)
14212 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
14213 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1}
14214 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
14215 ; AVX512F-ONLY-FAST-NEXT: addq $1800, %rsp # imm = 0x708
14216 ; AVX512F-ONLY-FAST-NEXT: vzeroupper
14217 ; AVX512F-ONLY-FAST-NEXT: retq
14219 ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf64:
14220 ; AVX512DQ-SLOW: # %bb.0:
14221 ; AVX512DQ-SLOW-NEXT: subq $1592, %rsp # imm = 0x638
14222 ; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1
14223 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2
14224 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
14225 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30
14226 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29
14227 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
14228 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
14229 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
14230 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
14231 ; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3
14232 ; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4
14233 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
14234 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17
14235 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm12
14236 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
14237 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
14238 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3
14239 ; AVX512DQ-SLOW-NEXT: vporq %ymm3, %ymm2, %ymm26
14240 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm3
14241 ; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm14
14242 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3]
14243 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
14244 ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14245 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14246 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm4
14247 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
14248 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6
14249 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7
14250 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
14251 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm8
14252 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
14253 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1
14254 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
14255 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm11
14256 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6
14257 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
14258 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
14259 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
14260 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
14261 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14262 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
14263 ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm13
14264 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18
14265 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm18[0,1,0,2]
14266 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0
14267 ; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15
14268 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3]
14269 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
14270 ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
14271 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14272 ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0
14273 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
14274 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm4
14275 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm16
14276 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
14277 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14278 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
14279 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
14280 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm21
14281 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm19
14282 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14283 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
14284 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14285 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5
14286 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0
14287 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14288 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7]
14289 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
14290 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
14291 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
14292 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
14293 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14294 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,1,3,4,5,5,7]
14295 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
14296 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
14297 ; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm10
14298 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm15[1],xmm10[2,3,4,5,6,7]
14299 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
14300 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
14301 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0
14302 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14303 ; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7
14304 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm11
14305 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
14306 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
14307 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3
14308 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0
14309 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1
14310 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
14311 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
14312 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
14313 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6
14314 ; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3
14315 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14316 ; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm2
14317 ; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1
14318 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
14319 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23
14320 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
14321 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
14322 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm5
14323 ; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm22
14324 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm22[0,1,0,2]
14325 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14326 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7]
14327 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
14328 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
14329 ; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0
14330 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm14[1],xmm0[2,3,4,5,6,7]
14331 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm14, %xmm17
14332 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25
14333 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
14334 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
14335 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
14336 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14337 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1
14338 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14339 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
14340 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm14
14341 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14342 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14343 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
14344 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4
14345 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14346 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0
14347 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14348 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
14349 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
14350 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
14351 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14352 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
14353 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14354 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6
14355 ; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm3
14356 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14357 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7]
14358 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19
14359 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16
14360 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8
14361 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
14362 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
14363 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
14364 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14365 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2]
14366 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
14367 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
14368 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
14369 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3]
14370 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm21
14371 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24
14372 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14373 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,2,3]
14374 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
14375 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm3
14376 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14377 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
14378 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14379 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14380 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
14381 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
14382 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7
14383 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm13
14384 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14385 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10
14386 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14387 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6,7]
14388 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
14389 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
14390 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5
14391 ; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm3
14392 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14393 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm15
14394 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7]
14395 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm9
14396 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
14397 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
14398 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5
14399 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14400 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,2]
14401 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
14402 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
14403 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
14404 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm6
14405 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2
14406 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
14407 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14408 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
14409 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
14410 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
14411 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14412 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
14413 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14414 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14415 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
14416 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
14417 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14418 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8
14419 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14420 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7
14421 ; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0
14422 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14423 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7]
14424 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
14425 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
14426 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6
14427 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
14428 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
14429 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5
14430 ; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0
14431 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14432 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4
14433 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0
14434 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7]
14435 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14436 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14437 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm18[0,1,1,3]
14438 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
14439 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5
14440 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14441 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,1,4,5,6,5]
14442 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14443 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
14444 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6
14445 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8
14446 ; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm21, %xmm7
14447 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
14448 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
14449 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14450 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
14451 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14452 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14453 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1
14454 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,1,1,3]
14455 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14456 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,1,2,1,4,5,6,5]
14457 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14458 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
14459 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm3
14460 ; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm5
14461 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
14462 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1
14463 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14464 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7]
14465 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12
14466 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
14467 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7]
14468 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
14469 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
14470 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14471 ; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5
14472 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6
14473 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7]
14474 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
14475 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
14476 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
14477 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
14478 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0
14479 ; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm24, %xmm7
14480 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
14481 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm13
14482 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
14483 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14484 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm11
14485 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
14486 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
14487 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
14488 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
14489 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
14490 ; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1
14491 ; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm3
14492 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7]
14493 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
14494 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
14495 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14496 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
14497 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9
14498 ; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm17, %xmm8
14499 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
14500 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7
14501 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14502 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7]
14503 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
14504 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7]
14505 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7]
14506 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
14507 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7
14508 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
14509 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
14510 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
14511 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
14512 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7],ymm7[8,9,10,11,12],ymm6[13,14,15]
14513 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
14514 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
14515 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
14516 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
14517 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14518 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
14519 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
14520 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
14521 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
14522 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
14523 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
14524 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14525 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,2,1,4,5,6,5]
14526 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14527 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
14528 ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
14529 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
14530 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
14531 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14532 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
14533 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18
14534 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25
14535 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
14536 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
14537 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
14538 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5,6,7]
14539 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14540 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
14541 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
14542 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14543 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
14544 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
14545 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
14546 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
14547 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
14548 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14549 ; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1
14550 ; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2
14551 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
14552 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6
14553 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7
14554 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
14555 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
14556 ; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3
14557 ; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2
14558 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
14559 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm24
14560 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5
14561 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
14562 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
14563 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
14564 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
14565 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
14566 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
14567 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
14568 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14569 ; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3
14570 ; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm8
14571 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
14572 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm9
14573 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
14574 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
14575 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
14576 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
14577 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
14578 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14579 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
14580 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22
14581 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7]
14582 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm20
14583 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21
14584 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
14585 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
14586 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
14587 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
14588 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
14589 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14590 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[0,1,2,1,4,5,6,5]
14591 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14592 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
14593 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14594 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
14595 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14596 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2
14597 ; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm3
14598 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
14599 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11
14600 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
14601 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
14602 ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14
14603 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0
14604 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7]
14605 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
14606 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6],ymm10[7,8,9,10,11,12,13],ymm12[14],ymm10[15]
14607 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm10, %ymm1
14608 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
14609 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
14610 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7]
14611 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14612 ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1
14613 ; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2
14614 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
14615 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
14616 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7]
14617 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
14618 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7]
14619 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
14620 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
14621 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm12[6,7]
14622 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14623 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7]
14624 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19
14625 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm15
14626 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12
14627 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
14628 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6,7]
14629 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm3
14630 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
14631 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
14632 ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
14633 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10
14634 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
14635 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12
14636 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
14637 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
14638 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
14639 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23
14640 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4
14641 ; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm28
14642 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm28[2,1,2,3]
14643 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
14644 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
14645 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
14646 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
14647 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14648 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm11[6,7]
14649 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14650 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
14651 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
14652 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6],xmm11[7]
14653 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10
14654 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1
14655 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
14656 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
14657 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
14658 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0
14659 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4,5,6],xmm0[7]
14660 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
14661 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
14662 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
14663 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
14664 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
14665 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
14666 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
14667 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
14668 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
14669 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
14670 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14671 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
14672 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
14673 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm10
14674 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14675 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4,5,6,7,8],ymm0[9],ymm10[10,11,12,13,14,15]
14676 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm1
14677 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm13
14678 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm15[2],ymm13[3,4,5],ymm15[6],ymm13[7]
14679 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12
14680 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
14681 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
14682 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm10, %ymm10
14683 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
14684 ; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm19, %ymm10
14685 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2
14686 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
14687 ; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm28
14688 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
14689 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,2,1]
14690 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
14691 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
14692 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14693 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
14694 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
14695 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
14696 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm10 # 64-byte Folded Reload
14697 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
14698 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
14699 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
14700 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14701 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm10
14702 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7]
14703 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11
14704 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14705 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
14706 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
14707 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm15
14708 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm15[4],xmm11[5],xmm15[6],xmm11[7]
14709 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11
14710 ; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm19, %ymm11
14711 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
14712 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12
14713 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
14714 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
14715 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7]
14716 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
14717 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14718 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
14719 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
14720 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload
14721 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1}
14722 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14723 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0
14724 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm11
14725 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7]
14726 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14727 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
14728 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
14729 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
14730 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14731 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[0,1,2,0,4,5,6,4]
14732 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
14733 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
14734 ; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
14735 ; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
14736 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
14737 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27
14738 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
14739 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19
14740 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14741 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
14742 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
14743 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm26
14744 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
14745 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
14746 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
14747 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
14748 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
14749 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
14750 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
14751 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm13
14752 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0
14753 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
14754 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
14755 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14756 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
14757 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
14758 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm27 # 64-byte Folded Reload
14759 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
14760 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0
14761 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2
14762 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7]
14763 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14764 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
14765 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
14766 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
14767 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14768 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,1,2,0,4,5,6,4]
14769 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
14770 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
14771 ; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
14772 ; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
14773 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
14774 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm29
14775 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
14776 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm24
14777 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm25
14778 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14779 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
14780 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
14781 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
14782 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm23
14783 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21
14784 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0]
14785 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm11
14786 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
14787 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm6
14788 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20
14789 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
14790 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
14791 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0
14792 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
14793 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,5,4]
14794 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
14795 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
14796 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
14797 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm29 # 64-byte Folded Reload
14798 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
14799 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14800 ; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14801 ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
14802 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
14803 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm12
14804 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14805 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7,8,9,10,11,12,13],ymm0[14],ymm12[15]
14806 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14807 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14808 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
14809 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31
14810 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30
14811 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
14812 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
14813 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
14814 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
14815 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
14816 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm0[2,3,4,5,6,7]
14817 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm8
14818 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
14819 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18
14820 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12
14821 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
14822 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14823 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
14824 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13
14825 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14826 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
14827 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
14828 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5
14829 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm4
14830 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
14831 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10
14832 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
14833 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
14834 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
14835 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14836 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
14837 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm12, %ymm12
14838 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
14839 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
14840 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10
14841 ; AVX512DQ-SLOW-NEXT: vpor %ymm12, %ymm10, %ymm10
14842 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm28
14843 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
14844 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14
14845 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload
14846 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
14847 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14848 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
14849 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
14850 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0
14851 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm7
14852 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7]
14853 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm10
14854 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14855 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7,8,9,10,11,12,13],ymm0[14],ymm10[15]
14856 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
14857 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
14858 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7
14859 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9
14860 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
14861 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
14862 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
14863 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
14864 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
14865 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
14866 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7]
14867 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
14868 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm14, %zmm12
14869 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm9
14870 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm7
14871 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
14872 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10
14873 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7]
14874 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
14875 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm14
14876 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3
14877 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14878 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
14879 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm23[0,1,0,1]
14880 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14881 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7,8,9,10],ymm11[11],ymm10[12,13,14,15]
14882 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm10, %ymm1
14883 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm6
14884 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm15
14885 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
14886 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
14887 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
14888 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2
14889 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14890 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
14891 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1
14892 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14893 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
14894 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14895 ; AVX512DQ-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
14896 ; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
14897 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14898 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
14899 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm13
14900 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
14901 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm20
14902 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
14903 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
14904 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
14905 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21
14906 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25
14907 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10
14908 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
14909 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
14910 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0
14911 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
14912 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14913 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
14914 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
14915 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0
14916 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14917 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
14918 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
14919 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7]
14920 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26
14921 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
14922 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
14923 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5
14924 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm11
14925 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm11[2,3],ymm5[4,5],ymm11[6,7]
14926 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm18
14927 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12
14928 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
14929 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
14930 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
14931 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
14932 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
14933 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
14934 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1
14935 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3,4,5,6,7]
14936 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload
14937 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
14938 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm12 # 64-byte Folded Reload
14939 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
14940 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7]
14941 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14942 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
14943 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0
14944 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7]
14945 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm30
14946 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm14
14947 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
14948 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
14949 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
14950 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm9
14951 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7]
14952 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31
14953 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10
14954 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
14955 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
14956 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14957 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
14958 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0
14959 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14960 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm6
14961 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5
14962 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
14963 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
14964 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
14965 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1
14966 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10
14967 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15
14968 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
14969 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
14970 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
14971 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
14972 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
14973 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
14974 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
14975 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
14976 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
14977 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm7
14978 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
14979 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0
14980 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
14981 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14982 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
14983 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1
14984 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7]
14985 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
14986 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
14987 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
14988 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
14989 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
14990 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
14991 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
14992 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
14993 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0
14994 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7]
14995 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
14996 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
14997 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0
14998 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3
14999 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
15000 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
15001 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
15002 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
15003 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
15004 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
15005 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15006 ; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15007 ; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
15008 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,3,1]
15009 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
15010 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm4
15011 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
15012 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
15013 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
15014 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15015 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
15016 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
15017 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm11 # 64-byte Folded Reload
15018 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1}
15019 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
15020 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
15021 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15]
15022 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15023 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
15024 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
15025 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
15026 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7]
15027 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5
15028 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
15029 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
15030 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
15031 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
15032 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
15033 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm6
15034 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1
15035 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7]
15036 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
15037 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7]
15038 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
15039 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4
15040 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
15041 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
15042 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
15043 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
15044 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
15045 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
15046 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
15047 ; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
15048 ; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
15049 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1]
15050 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0
15051 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7]
15052 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15053 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
15054 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15055 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
15056 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
15057 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15058 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
15059 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
15060 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
15061 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
15062 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload
15063 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload
15064 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2
15065 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3
15066 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%rsi)
15067 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rsi)
15068 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
15069 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx)
15070 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15071 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx)
15072 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15073 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx)
15074 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 64(%r8)
15075 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%r8)
15076 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15077 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9)
15078 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15079 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9)
15080 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15081 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, (%rax)
15082 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax)
15083 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15084 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
15085 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
15086 ; AVX512DQ-SLOW-NEXT: addq $1592, %rsp # imm = 0x638
15087 ; AVX512DQ-SLOW-NEXT: vzeroupper
15088 ; AVX512DQ-SLOW-NEXT: retq
15089 ;
15090 ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf64:
15091 ; AVX512DQ-FAST: # %bb.0:
15092 ; AVX512DQ-FAST-NEXT: subq $1304, %rsp # imm = 0x518
15093 ; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2
15094 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
15095 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,5,9,u,12,u,u,u]
15096 ; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm17, %zmm0
15097 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
15098 ; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm1
15099 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm27
15100 ; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm4
15101 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18
15102 ; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm5
15103 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm12
15104 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7]
15105 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm22
15106 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5
15107 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
15108 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
15109 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6
15110 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
15111 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
15112 ; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm20
15113 ; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm7
15114 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
15115 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6
15116 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm11
15117 ; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7
15118 ; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
15119 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15120 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24
15121 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2]
15122 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15123 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1
15124 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6
15125 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
15126 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
15127 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8
15128 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
15129 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
15130 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3
15131 ; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm3, %ymm31
15132 ; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm2
15133 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm1
15134 ; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3
15135 ; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
15136 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15137 ; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm13
15138 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15139 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
15140 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
15141 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm15
15142 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16
15143 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
15144 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
15145 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15146 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
15147 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
15148 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15149 ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm0
15150 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm5
15151 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm0[2],ymm5[3,4,5],ymm0[6],ymm5[7]
15152 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm19
15153 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6
15154 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
15155 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
15156 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6
15157 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
15158 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
15159 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9
15160 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
15161 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm2[1],xmm13[2,3,4,5,6,7]
15162 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm30
15163 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28
15164 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
15165 ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10
15166 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
15167 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15168 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0
15169 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7]
15170 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm23
15171 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
15172 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
15173 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8
15174 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm2
15175 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
15176 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm8, %ymm2
15177 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15178 ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0
15179 ; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1
15180 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
15181 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25
15182 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17
15183 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8
15184 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6],xmm3[7]
15185 ; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm22
15186 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4
15187 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,1,0,2]
15188 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
15189 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm6
15190 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26
15191 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm6[7]
15192 ; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm14
15193 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, %xmm1
15194 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15195 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
15196 ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7
15197 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,6,9,u,13,u,u,u]
15198 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0
15199 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15200 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0
15201 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15202 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm15[1],ymm0[2,3],ymm15[4],ymm0[5,6,7]
15203 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm3
15204 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15205 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
15206 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
15207 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15208 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm4, %zmm8
15209 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15210 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8
15211 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15212 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6
15213 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm8, %ymm2
15214 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15215 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm11
15216 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7]
15217 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
15218 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
15219 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
15220 ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm6
15221 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm16
15222 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
15223 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5]
15224 ; AVX512DQ-FAST-NEXT: vpermd %ymm24, %ymm10, %ymm13
15225 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15226 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,6],ymm13[7]
15227 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm2
15228 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm8
15229 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
15230 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm21
15231 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm19
15232 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15233 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm15
15234 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm2
15235 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15236 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm2
15237 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15238 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm12
15239 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15240 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7]
15241 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15
15242 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
15243 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9
15244 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm4, %zmm4
15245 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15246 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4
15247 ; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm9, %ymm4
15248 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15249 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm4
15250 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7
15251 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7]
15252 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7
15253 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7]
15254 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm7
15255 ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4
15256 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
15257 ; AVX512DQ-FAST-NEXT: vpermd %ymm22, %ymm10, %ymm7
15258 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15259 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
15260 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
15261 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm30
15262 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm8
15263 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm1
15264 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15265 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
15266 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8
15267 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
15268 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15]
15269 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm16, %zmm9
15270 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15271 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9
15272 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15273 ; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4
15274 ; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm9, %ymm0
15275 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15276 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6,7]
15277 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9
15278 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7]
15279 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
15280 ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4
15281 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm15
15282 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
15283 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm24[0,1,1,3]
15284 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm14
15285 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
15286 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
15287 ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm6
15288 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm0
15289 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15290 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7]
15291 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14
15292 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2],xmm6[3],xmm14[4,5,6,7]
15293 ; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6
15294 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm13
15295 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm8
15296 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm8, %ymm0
15297 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15298 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm2
15299 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm12
15300 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7]
15301 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
15302 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
15303 ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6
15304 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
15305 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,3]
15306 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm9
15307 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
15308 ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7
15309 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm29
15310 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7]
15311 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
15312 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
15313 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
15314 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1
15315 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15316 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
15317 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4
15318 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
15319 ; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4
15320 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm5
15321 ; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm21, %xmm7
15322 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
15323 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm27
15324 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1
15325 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1
15326 ; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm21
15327 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [3,6,10,13,3,6,10,13]
15328 ; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3]
15329 ; AVX512DQ-FAST-NEXT: vpermd %zmm21, %zmm28, %zmm3
15330 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
15331 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3
15332 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15333 ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0
15334 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
15335 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm20 {%k1} # 16-byte Folded Reload
15336 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15337 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7]
15338 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
15339 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
15340 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0
15341 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15342 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm1
15343 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
15344 ; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1
15345 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm4
15346 ; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm30, %xmm2
15347 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
15348 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26
15349 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
15350 ; AVX512DQ-FAST-NEXT: vpermd %ymm24, %ymm0, %ymm1
15351 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
15352 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm3
15353 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2
15354 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
15355 ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
15356 ; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm1, %zmm6
15357 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
15358 ; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6
15359 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7],ymm6[8,9,10,11,12],ymm3[13,14,15]
15360 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm12
15361 ; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm19, %xmm6
15362 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
15363 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm3
15364 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15365 ; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm6
15366 ; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm10
15367 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7]
15368 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm16
15369 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm17
15370 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
15371 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7]
15372 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
15373 ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3
15374 ; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
15375 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,u,u,u,4,7,11,14]
15376 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm20, %zmm14
15377 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
15378 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm14
15379 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3,4,5,6],xmm14[7]
15380 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
15381 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
15382 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm8
15383 ; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm28, %zmm14
15384 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9
15385 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7]
15386 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm31 {%k1} # 16-byte Folded Reload
15387 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15388 ; AVX512DQ-FAST-NEXT: vpermd %ymm22, %ymm0, %ymm0
15389 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0
15390 ; AVX512DQ-FAST-NEXT: vpermd %zmm21, %zmm1, %zmm8
15391 ; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8
15392 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
15393 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
15394 ; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm9, %xmm8
15395 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
15396 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm31
15397 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm0
15398 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15399 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,4,7,0,0,4,7,0]
15400 ; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1]
15401 ; AVX512DQ-FAST-NEXT: vpermd %ymm24, %ymm7, %ymm0
15402 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
15403 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13]
15404 ; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1]
15405 ; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm8, %zmm2
15406 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
15407 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
15408 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
15409 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15410 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4
15411 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
15412 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
15413 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15414 ; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4
15415 ; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm2
15416 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
15417 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2
15418 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15419 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15420 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0
15421 ; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm15
15422 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
15423 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm2
15424 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14
15425 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3],xmm14[4],xmm5[5],xmm14[6,7]
15426 ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10
15427 ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18
15428 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm20, %zmm14
15429 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6
15430 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3,4,5,6],xmm6[7]
15431 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
15432 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14
15433 ; AVX512DQ-FAST-NEXT: vpermd %zmm14, %zmm1, %zmm1
15434 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
15435 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm1[6,7]
15436 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15437 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm2[3],ymm15[4,5],ymm2[6],ymm15[7]
15438 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11
15439 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6
15440 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
15441 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
15442 ; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
15443 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [1,u,u,u,4,8,11,15]
15444 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm20, %zmm10
15445 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
15446 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm10
15447 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7]
15448 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
15449 ; AVX512DQ-FAST-NEXT: vpermd %zmm14, %zmm8, %zmm10
15450 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
15451 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10
15452 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm10[6,7]
15453 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15454 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5
15455 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0
15456 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7]
15457 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10
15458 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6],xmm10[7]
15459 ; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1
15460 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm20, %zmm6
15461 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm3
15462 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
15463 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
15464 ; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm8, %zmm3
15465 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2
15466 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
15467 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15468 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7]
15469 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm16
15470 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
15471 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
15472 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
15473 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15474 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,u,u,5,8,12,15]
15475 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm3, %zmm6
15476 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
15477 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6
15478 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1
15479 ; AVX512DQ-FAST-NEXT: vpermd %zmm14, %zmm28, %zmm6
15480 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6
15481 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
15482 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
15483 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
15484 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
15485 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
15486 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
15487 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
15488 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15489 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7]
15490 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm14
15491 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6
15492 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
15493 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15494 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm2
15495 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2
15496 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1
15497 ; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm28, %zmm0
15498 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0
15499 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15500 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15501 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
15502 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
15503 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15504 ; AVX512DQ-FAST-NEXT: vpermd %ymm22, %ymm7, %ymm0
15505 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
15506 ; AVX512DQ-FAST-NEXT: vpermd %zmm21, %zmm8, %zmm1
15507 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
15508 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
15509 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm1
15510 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
15511 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2
15512 ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
15513 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
15514 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15515 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1
15516 ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2
15517 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
15518 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11
15519 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm8
15520 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
15521 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
15522 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm7
15523 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7]
15524 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
15525 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
15526 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
15527 ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
15528 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15529 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9]
15530 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm9, %zmm6
15531 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
15532 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6
15533 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
15534 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
15535 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm2
15536 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2
15537 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
15538 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15539 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
15540 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
15541 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15542 ; AVX512DQ-FAST-NEXT: vmovdqa 864(%rdi), %ymm1
15543 ; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm13
15544 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7]
15545 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31
15546 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
15547 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
15548 ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
15549 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm9, %zmm2
15550 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2
15551 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7]
15552 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm23
15553 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm22
15554 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9
15555 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3,4,5],xmm3[6],xmm9[7]
15556 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3
15557 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15558 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
15559 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2
15560 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
15561 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm29 # 64-byte Folded Reload
15562 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
15563 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15564 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
15565 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
15566 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
15567 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
15568 ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm2
15569 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm17
15570 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
15571 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
15572 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
15573 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,3,7,10,14,u,u,u]
15574 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
15575 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm21, %zmm3
15576 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
15577 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3
15578 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm5
15579 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
15580 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm27
15581 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
15582 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm12
15583 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm19
15584 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
15585 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
15586 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7]
15587 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm11
15588 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10
15589 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
15590 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
15591 ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0
15592 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15593 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,u,u,u,6,9,13,u]
15594 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm2
15595 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
15596 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2
15597 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
15598 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
15599 ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3
15600 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2
15601 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
15602 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
15603 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
15604 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15605 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7]
15606 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20
15607 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm8
15608 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm2
15609 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
15610 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
15611 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
15612 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
15613 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm21, %zmm2
15614 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
15615 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm29
15616 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
15617 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm26
15618 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm5
15619 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
15620 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
15621 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
15622 ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0
15623 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm2
15624 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1
15625 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm4
15626 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm7
15627 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7]
15628 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
15629 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
15630 ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2
15631 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15632 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
15633 ; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
15634 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15635 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm26 {%k1}
15636 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
15637 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm23
15638 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm31
15639 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
15640 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
15641 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15642 ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
15643 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24
15644 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
15645 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,4,7,11,14,u,u,u]
15646 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm22, %zmm3
15647 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
15648 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3
15649 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
15650 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm6
15651 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7]
15652 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm21
15653 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10
15654 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
15655 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7]
15656 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm19
15657 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12
15658 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
15659 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
15660 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3
15661 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
15662 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,u,u,u,6,10,13,u]
15663 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm1
15664 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
15665 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15666 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
15667 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
15668 ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10
15669 ; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1
15670 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
15671 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload
15672 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
15673 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm10
15674 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7]
15675 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
15676 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3
15677 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm8
15678 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
15679 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
15680 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm22, %zmm3
15681 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3
15682 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm3[2,3,4,5,6,7]
15683 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5,6,7]
15684 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20
15685 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
15686 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
15687 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
15688 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm3
15689 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2
15690 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
15691 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22
15692 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5
15693 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
15694 ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3
15695 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15696 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
15697 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2
15698 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
15699 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm9 # 64-byte Folded Reload
15700 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1}
15701 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1
15702 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm2
15703 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
15704 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
15705 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
15706 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
15707 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
15708 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
15709 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,4,8,11,15,u,u,u]
15710 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm2
15711 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm15
15712 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2
15713 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7]
15714 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm8
15715 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
15716 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
15717 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
15718 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2
15719 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7]
15720 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
15721 ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6
15722 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
15723 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
15724 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
15725 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3]
15726 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm8
15727 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
15728 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8
15729 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
15730 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
15731 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
15732 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15733 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
15734 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
15735 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload
15736 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1}
15737 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
15738 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6
15739 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4
15740 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
15741 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
15742 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm5, %zmm4
15743 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4
15744 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
15745 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
15746 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
15747 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7]
15748 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm4
15749 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7]
15750 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11
15751 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7]
15752 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4
15753 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm8
15754 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
15755 ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3
15756 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8
15757 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
15758 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
15759 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
15760 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm7
15761 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
15762 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7]
15763 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3
15764 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
15765 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15766 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
15767 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
15768 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
15769 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15770 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
15771 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
15772 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
15773 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
15774 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
15775 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
15776 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm10
15777 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm8
15778 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rsi)
15779 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi)
15780 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rdx)
15781 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rdx)
15782 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15783 ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%rcx)
15784 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15785 ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rcx)
15786 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15787 ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%r8)
15788 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15789 ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r8)
15790 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 64(%r9)
15791 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%r9)
15792 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
15793 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax)
15794 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
15795 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
15796 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax)
15797 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax)
15798 ; AVX512DQ-FAST-NEXT: addq $1304, %rsp # imm = 0x518
15799 ; AVX512DQ-FAST-NEXT: vzeroupper
15800 ; AVX512DQ-FAST-NEXT: retq
15801 ;
15802 ; AVX512BW-LABEL: load_i16_stride7_vf64:
15803 ; AVX512BW: # %bb.0:
15804 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15805 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
15806 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0
15807 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2
15808 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5
15809 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3
15810 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6
15811 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7
15812 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4
15813 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13
15814 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15
15815 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10
15816 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11
15817 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
15818 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14
15819 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12
15820 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
15821 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
15822 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17
15823 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17
15824 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u]
15825 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9
15826 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
15827 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
15828 ; AVX512BW-NEXT: kmovd %edi, %k2
15829 ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2}
15830 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
15831 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
15832 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18
15833 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm18
15834 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
15835 ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
15836 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm18
15837 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
15838 ; AVX512BW-NEXT: kmovd %edi, %k1
15839 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1}
15840 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm17
15841 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm17
15842 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm16
15843 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm8
15844 ; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2}
15845 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
15846 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
15847 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
15848 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19
15849 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm19
15850 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
15851 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
15852 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19
15853 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u]
15854 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22
15855 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22
15856 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
15857 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
15858 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16
15859 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm17, %zmm16
15860 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
15861 ; AVX512BW-NEXT: kmovd %edi, %k2
15862 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
15863 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1}
15864 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18
15865 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm18
15866 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm17
15867 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21
15868 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
15869 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1}
15870 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
15871 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
15872 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21
15873 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21
15874 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
15875 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
15876 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm21
15877 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
15878 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
15879 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24
15880 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24
15881 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u]
15882 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19
15883 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
15884 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
15885 ; AVX512BW-NEXT: kmovd %edi, %k1
15886 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm19 {%k1}
15887 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00
15888 ; AVX512BW-NEXT: kmovd %edi, %k2
15889 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2}
15890 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm20
15891 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm22, %zmm20
15892 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm23
15893 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm18
15894 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm18 {%k1}
15895 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2}
15896 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
15897 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
15898 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23
15899 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm22, %zmm23
15900 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
15901 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15902 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm23
15903 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
15904 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
15905 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26
15906 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26
15907 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u]
15908 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21
15909 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21
15910 ; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1}
15911 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2}
15912 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm22
15913 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm24, %zmm22
15914 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm25
15915 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm20
15916 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
15917 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2}
15918 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
15919 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15920 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25
15921 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
15922 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
15923 ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
15924 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
15925 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
15926 ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
15927 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28
15928 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28
15929 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u]
15930 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23
15931 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23
15932 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1}
15933 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2}
15934 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
15935 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
15936 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm27
15937 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm22
15938 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm22 {%k1}
15939 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2}
15940 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
15941 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15942 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25
15943 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
15944 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
15945 ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
15946 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
15947 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
15948 ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
15949 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28
15950 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28
15951 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u]
15952 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30
15953 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30
15954 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1}
15955 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2}
15956 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
15957 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
15958 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm27
15959 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm29
15960 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1}
15961 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2}
15962 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
15963 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15964 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm10
15965 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
15966 ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
15967 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm10
15968 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
15969 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
15970 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12
15971 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u]
15972 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1
15973 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1}
15974 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
15975 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm4
15976 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4
15977 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
15978 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm0
15979 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
15980 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2}
15981 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi)
15982 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi)
15983 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
15984 ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx)
15985 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx)
15986 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx)
15987 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8)
15988 ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8)
15989 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9)
15990 ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9)
15991 ; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r10)
15992 ; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r10)
15993 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
15994 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
15995 ; AVX512BW-NEXT: vzeroupper
15996 ; AVX512BW-NEXT: retq
15997 %wide.vec = load <448 x i16>, ptr %in.vec, align 64
15998 %strided.vec0 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
15999 %strided.vec1 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
16000 %strided.vec2 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
16001 %strided.vec3 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
16002 %strided.vec4 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
16003 %strided.vec5 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
16004 %strided.vec6 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
16005 store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
16006 store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
16007 store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
16008 store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
16009 store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
16010 store <64 x i16> %strided.vec5, ptr %out.vec5, align 64
16011 store <64 x i16> %strided.vec6, ptr %out.vec6, align 64
16012 ret void
16013 }
16014 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
16018 ; AVX2-ONLY: {{.*}}
16020 ; AVX512-FAST: {{.*}}
16021 ; AVX512BW-ONLY: {{.*}}
16022 ; AVX512BW-ONLY-FAST: {{.*}}
16023 ; AVX512BW-ONLY-SLOW: {{.*}}
16024 ; AVX512BW-SLOW: {{.*}}
16025 ; AVX512DQ-ONLY: {{.*}}
16026 ; AVX512DQBW-FAST: {{.*}}
16027 ; AVX512DQBW-ONLY: {{.*}}
16028 ; AVX512DQBW-SLOW: {{.*}}
16030 ; AVX512F-ONLY: {{.*}}
16031 ; FALLBACK0: {{.*}}
16032 ; FALLBACK1: {{.*}}
16033 ; FALLBACK10: {{.*}}
16034 ; FALLBACK11: {{.*}}
16035 ; FALLBACK12: {{.*}}
16036 ; FALLBACK2: {{.*}}
16037 ; FALLBACK3: {{.*}}
16038 ; FALLBACK4: {{.*}}
16039 ; FALLBACK5: {{.*}}
16040 ; FALLBACK6: {{.*}}
16041 ; FALLBACK7: {{.*}}
16042 ; FALLBACK8: {{.*}}
16043 ; FALLBACK9: {{.*}}