; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.
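;
; A stride-7 interleaved load corresponds to a source loop of roughly this
; shape (a hypothetical C sketch for illustration only; the function name is
; made up and it is not part of the generated assertions):
;
;   void split_stride7(const short *in, short *out0, short *out1, short *out2,
;                      short *out3, short *out4, short *out5, short *out6,
;                      int n) {
;     /* One structure of 7 i16 fields per iteration; the vectorizer loads a
;        wide contiguous vector and deinterleaves it into 7 strided vectors. */
;     for (int i = 0; i < n; ++i) {
;       out0[i] = in[7 * i + 0];
;       out1[i] = in[7 * i + 1];
;       out2[i] = in[7 * i + 2];
;       out3[i] = in[7 * i + 3];
;       out4[i] = in[7 * i + 4];
;       out5[i] = in[7 * i + 5];
;       out6[i] = in[7 * i + 6];
;     }
;   }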

define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i16_stride7_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: psrld $16, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm2, (%rsi)
; SSE-NEXT: movd %xmm4, (%rdx)
; SSE-NEXT: movd %xmm6, (%rcx)
; SSE-NEXT: movd %xmm5, (%r8)
; SSE-NEXT: movd %xmm7, (%r9)
; SSE-NEXT: movd %xmm3, (%r10)
; SSE-NEXT: movd %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride7_vf2:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovd %xmm2, (%rsi)
; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx)
; AVX1-ONLY-NEXT: vmovd %xmm6, (%rcx)
; AVX1-ONLY-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX1-ONLY-NEXT: vmovd %xmm7, (%r9)
; AVX1-ONLY-NEXT: vmovd %xmm3, (%r10)
; AVX1-ONLY-NEXT: vmovd %xmm0, (%rax)
; AVX1-ONLY-NEXT: retq
;
; AVX2-SLOW-LABEL: load_i16_stride7_vf2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX2-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX2-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX2-SLOW-NEXT: vmovd %xmm4, (%rdx)
; AVX2-SLOW-NEXT: vmovd %xmm6, (%rcx)
; AVX2-SLOW-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX2-SLOW-NEXT: vmovd %xmm7, (%r9)
; AVX2-SLOW-NEXT: vmovd %xmm3, (%r10)
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: load_i16_stride7_vf2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8
; AVX2-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX2-FAST-NEXT: vmovd %xmm4, (%rdx)
; AVX2-FAST-NEXT: vmovd %xmm7, (%rcx)
; AVX2-FAST-NEXT: vmovd %xmm5, (%r8)
; AVX2-FAST-NEXT: vmovd %xmm8, (%r9)
; AVX2-FAST-NEXT: vmovd %xmm3, (%r10)
; AVX2-FAST-NEXT: vmovd %xmm0, (%rax)
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf2:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 8(%rdi), %xmm8
; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm1, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm2, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm4, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm7, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm5, (%r8)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm8, (%r9)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm3, (%r10)
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm0, (%rax)
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-SLOW-LABEL: load_i16_stride7_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX512-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512-SLOW-NEXT: vmovd %xmm4, (%rdx)
; AVX512-SLOW-NEXT: vmovd %xmm6, (%rcx)
; AVX512-SLOW-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX512-SLOW-NEXT: vmovd %xmm7, (%r9)
; AVX512-SLOW-NEXT: vmovd %xmm3, (%r10)
; AVX512-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i16_stride7_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8
; AVX512F-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX512F-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512F-FAST-NEXT: vmovd %xmm4, (%rdx)
; AVX512F-FAST-NEXT: vmovd %xmm7, (%rcx)
; AVX512F-FAST-NEXT: vmovd %xmm5, (%r8)
; AVX512F-FAST-NEXT: vmovd %xmm8, (%r9)
; AVX512F-FAST-NEXT: vmovd %xmm3, (%r10)
; AVX512F-FAST-NEXT: vmovd %xmm0, (%rax)
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i16_stride7_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13]
; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8
; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx)
; AVX512BW-FAST-NEXT: vmovd %xmm6, (%rcx)
; AVX512BW-FAST-NEXT: vmovd %xmm5, (%r8)
; AVX512BW-FAST-NEXT: vmovd %xmm7, (%r9)
; AVX512BW-FAST-NEXT: vmovd %xmm3, (%r10)
; AVX512BW-FAST-NEXT: vmovd %xmm8, (%rax)
; AVX512BW-FAST-NEXT: retq
  %wide.vec = load <14 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 0, i32 7>
  %strided.vec1 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 1, i32 8>
  %strided.vec2 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 2, i32 9>
  %strided.vec3 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 3, i32 10>
  %strided.vec4 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 4, i32 11>
  %strided.vec5 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 5, i32 12>
  %strided.vec6 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 6, i32 13>
  store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i16> %strided.vec5, ptr %out.vec5, align 64
  store <2 x i16> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i16_stride7_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm4
; SSE-NEXT: movdqa 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: pandn %xmm2, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: por %xmm5, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm4, %xmm7
; SSE-NEXT: pand %xmm5, %xmm7
; SSE-NEXT: pandn %xmm1, %xmm5
; SSE-NEXT: por %xmm7, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7]
; SSE-NEXT: pand %xmm0, %xmm7
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE-NEXT: pandn %xmm5, %xmm0
; SSE-NEXT: por %xmm7, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; SSE-NEXT: movdqa %xmm3, %xmm10
; SSE-NEXT: movdqa %xmm3, %xmm9
; SSE-NEXT: psrlq $16, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1]
; SSE-NEXT: pslld $16, %xmm6
; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; SSE-NEXT: movdqa %xmm1, %xmm10
; SSE-NEXT: psrld $16, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT: psrlq $48, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: movq %xmm2, (%rsi)
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: movq %xmm7, (%rcx)
; SSE-NEXT: movq %xmm8, (%r8)
; SSE-NEXT: movq %xmm6, (%r9)
; SSE-NEXT: movq %xmm10, (%rdi)
; SSE-NEXT: movq %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride7_vf4:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm9
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm10
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
; AVX1-ONLY-NEXT: vmovq %xmm3, (%rsi)
; AVX1-ONLY-NEXT: vmovq %xmm5, (%rdx)
; AVX1-ONLY-NEXT: vmovq %xmm7, (%rcx)
; AVX1-ONLY-NEXT: vmovq %xmm8, (%r8)
; AVX1-ONLY-NEXT: vmovq %xmm9, (%r9)
; AVX1-ONLY-NEXT: vmovq %xmm4, (%r10)
; AVX1-ONLY-NEXT: vmovq %xmm0, (%rax)
; AVX1-ONLY-NEXT: retq
;
; AVX2-SLOW-LABEL: load_i16_stride7_vf4:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX2-SLOW-NEXT: vmovq %xmm1, (%rsi)
; AVX2-SLOW-NEXT: vmovq %xmm6, (%rdx)
; AVX2-SLOW-NEXT: vmovq %xmm3, (%rcx)
; AVX2-SLOW-NEXT: vmovq %xmm4, (%r8)
; AVX2-SLOW-NEXT: vmovq %xmm5, (%r9)
; AVX2-SLOW-NEXT: vmovq %xmm7, (%r10)
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: load_i16_stride7_vf4:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm8
; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-NEXT: vmovq %xmm5, (%rsi)
; AVX2-FAST-NEXT: vmovq %xmm6, (%rdx)
; AVX2-FAST-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FAST-NEXT: vmovq %xmm3, (%r8)
; AVX2-FAST-NEXT: vmovq %xmm4, (%r9)
; AVX2-FAST-NEXT: vmovq %xmm7, (%r10)
; AVX2-FAST-NEXT: vmovq %xmm0, (%rax)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf4:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm8
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%r8)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%r9)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm7, (%r10)
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%rax)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i16_stride7_vf4:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2
; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512F-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx)
; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rcx)
; AVX512F-SLOW-NEXT: vmovq %xmm5, (%r8)
; AVX512F-SLOW-NEXT: vmovq %xmm6, (%r9)
; AVX512F-SLOW-NEXT: vmovq %xmm7, (%r10)
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rax)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i16_stride7_vf4:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm8
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512F-FAST-NEXT: vmovq %xmm1, (%rcx)
; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512F-FAST-NEXT: vmovq %xmm6, (%r9)
; AVX512F-FAST-NEXT: vmovq %xmm7, (%r10)
; AVX512F-FAST-NEXT: vmovq %xmm2, (%rax)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride7_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-NEXT: vmovq %xmm5, (%r8)
; AVX512BW-NEXT: vmovq %xmm6, (%r9)
; AVX512BW-NEXT: vmovq %xmm7, (%r10)
; AVX512BW-NEXT: vmovq %xmm8, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %wide.vec = load <28 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
  %strided.vec1 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
  %strided.vec2 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
  %strided.vec3 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
  %strided.vec4 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
  %strided.vec5 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
  %strided.vec6 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
  store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <4 x i16> %strided.vec5, ptr %out.vec5, align 64
  store <4 x i16> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i16_stride7_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm9
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm8
; SSE-NEXT: movdqa 80(%rdi), %xmm7
; SSE-NEXT: movdqa 64(%rdi), %xmm1
; SSE-NEXT: movdqa 96(%rdi), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE-NEXT: movdqa %xmm11, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT: pand %xmm11, %xmm5
; SSE-NEXT: por %xmm4, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
; SSE-NEXT: movdqa %xmm3, %xmm10
; SSE-NEXT: pandn %xmm5, %xmm10
; SSE-NEXT: movaps %xmm2, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm8[2,2]
; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535]
; SSE-NEXT: movaps %xmm4, %xmm12
; SSE-NEXT: andnps %xmm5, %xmm12
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE-NEXT: pand %xmm4, %xmm5
; SSE-NEXT: por %xmm12, %xmm5
; SSE-NEXT: pand %xmm3, %xmm5
; SSE-NEXT: por %xmm10, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,1]
; SSE-NEXT: movdqa %xmm6, %xmm10
; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5]
; SSE-NEXT: movdqa %xmm11, %xmm12
; SSE-NEXT: pandn %xmm10, %xmm12
; SSE-NEXT: movdqa %xmm7, %xmm10
; SSE-NEXT: psrld $16, %xmm10
; SSE-NEXT: movdqa %xmm1, %xmm15
; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
; SSE-NEXT: pand %xmm11, %xmm15
; SSE-NEXT: por %xmm12, %xmm15
; SSE-NEXT: movdqa %xmm3, %xmm13
; SSE-NEXT: pandn %xmm15, %xmm13
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm10, %xmm12
; SSE-NEXT: pandn %xmm0, %xmm12
; SSE-NEXT: movdqa %xmm9, %xmm15
; SSE-NEXT: pand %xmm10, %xmm15
; SSE-NEXT: por %xmm12, %xmm15
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,3,3,4,5,6,7]
; SSE-NEXT: pand %xmm4, %xmm12
; SSE-NEXT: movaps %xmm2, %xmm15
; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7]
; SSE-NEXT: pandn %xmm15, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm15
; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
; SSE-NEXT: por %xmm12, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm12
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: por %xmm13, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm13
; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,1]
; SSE-NEXT: pand %xmm11, %xmm15
; SSE-NEXT: pandn %xmm14, %xmm11
; SSE-NEXT: por %xmm15, %xmm11
; SSE-NEXT: movdqa %xmm3, %xmm14
; SSE-NEXT: pandn %xmm11, %xmm14
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm15[1]
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
; SSE-NEXT: movss {{.*#+}} xmm11 = xmm15[0],xmm11[1,2,3]
; SSE-NEXT: andps %xmm3, %xmm11
; SSE-NEXT: orps %xmm14, %xmm11
; SSE-NEXT: movdqa %xmm10, %xmm14
; SSE-NEXT: pandn %xmm2, %xmm14
; SSE-NEXT: movdqa %xmm8, %xmm15
; SSE-NEXT: pand %xmm10, %xmm15
; SSE-NEXT: por %xmm14, %xmm15
; SSE-NEXT: movdqa %xmm0, %xmm14
; SSE-NEXT: psrld $16, %xmm14
; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2]
; SSE-NEXT: movss {{.*#+}} xmm15 = xmm12[0],xmm15[1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE-NEXT: andps %xmm3, %xmm15
; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,4,7]
; SSE-NEXT: pandn %xmm13, %xmm3
; SSE-NEXT: movdqa %xmm2, %xmm13
; SSE-NEXT: psrlq $16, %xmm13
; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
; SSE-NEXT: por %xmm15, %xmm3
; SSE-NEXT: movdqa %xmm7, %xmm15
; SSE-NEXT: pand %xmm10, %xmm15
; SSE-NEXT: pandn %xmm1, %xmm10
; SSE-NEXT: por %xmm15, %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
; SSE-NEXT: psrlq $48, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,1,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,4,7]
; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm10[0,2]
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,4,7]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT: movdqa %xmm5, (%rsi)
; SSE-NEXT: movdqa %xmm4, (%rdx)
; SSE-NEXT: movaps %xmm11, (%rcx)
; SSE-NEXT: movdqa %xmm3, (%r8)
; SSE-NEXT: movapd %xmm13, (%r9)
; SSE-NEXT: movaps %xmm14, (%rdi)
; SSE-NEXT: movapd %xmm1, (%rax)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i16_stride7_vf8:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5
; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,2,3,3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm5[2],xmm7[2],zero
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6],xmm9[7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,8,9,6,7,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,0,3,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4],xmm8[5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,0,1]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,2,3,3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm11[1],xmm10[1]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,1,2,3]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm5[1],xmm7[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,0,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm11
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm12
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,2,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,7,7]
; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,2]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5,6,7]
; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
960 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
961 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
962 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
963 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
964 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
965 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
966 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsi)
967 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rdx)
968 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx)
969 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%r8)
970 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%r9)
971 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%r10)
972 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax)
973 ; AVX1-ONLY-NEXT: retq
974 ;
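; Note: the AVX1-ONLY path stays entirely in XMM registers; it loads the 112 input bytes as seven 16-byte vectors and deinterleaves them with vpblendw, vpshufb, vpshufd and vpunpck* before storing one XMM per destination.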
975 ; AVX2-SLOW-LABEL: load_i16_stride7_vf8:
976 ; AVX2-SLOW: # %bb.0:
977 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
978 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
979 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
980 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
981 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0
982 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
983 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
984 ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
985 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
986 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
987 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
988 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
989 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
990 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
991 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
992 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7
993 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8
994 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
995 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
996 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
997 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
998 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
999 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1000 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1001 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1002 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
1003 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
1004 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
1005 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
1006 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1007 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1008 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1009 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1010 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1011 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1012 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1013 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1014 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1015 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
1016 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7]
1017 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,2,1,0,4,5,6,7]
1018 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
1019 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1020 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1021 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1022 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1023 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
1024 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
1025 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
1026 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
1027 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1028 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1029 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1030 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
1031 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
1032 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
1033 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
1034 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
1035 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1036 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
1037 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1038 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
1039 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
1040 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1041 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1042 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
1043 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3
1044 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
1045 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
1046 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1047 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1048 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1049 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1050 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1051 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsi)
1052 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rdx)
1053 ; AVX2-SLOW-NEXT: vmovdqa %xmm9, (%rcx)
1054 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, (%r8)
1055 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, (%r9)
1056 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, (%r10)
1057 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rax)
1058 ; AVX2-SLOW-NEXT: vzeroupper
1059 ; AVX2-SLOW-NEXT: retq
1060 ;
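; Note: the AVX2-SLOW block above leans on immediate shuffles (vpshufd, vpshuflw, vpshufhw); the AVX2-FAST block below, tuned for fast variable shuffles, expresses several of those steps as vpshufb with constant byte masks instead.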
1061 ; AVX2-FAST-LABEL: load_i16_stride7_vf8:
1062 ; AVX2-FAST: # %bb.0:
1063 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
1064 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
1065 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3
1066 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
1067 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0
1068 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
1069 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
1070 ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm2
1071 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1072 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1073 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
1074 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
1075 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1076 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1077 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1078 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7
1079 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8
1080 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
1081 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
1082 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
1083 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1084 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
1085 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1086 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1087 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
1088 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
1089 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
1090 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1091 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
1092 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1093 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1094 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1095 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1096 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1097 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1098 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1099 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
1100 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7]
1101 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u]
1102 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1103 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1104 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1105 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1106 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
1107 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14
1108 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12
1109 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
1110 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1111 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1112 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1113 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
1114 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13
1115 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1116 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1117 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
1118 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1119 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1120 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1121 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1122 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
1123 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1124 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
1125 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1126 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1127 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1128 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1129 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1130 ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsi)
1131 ; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rdx)
1132 ; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rcx)
1133 ; AVX2-FAST-NEXT: vmovdqa %xmm10, (%r8)
1134 ; AVX2-FAST-NEXT: vmovdqa %xmm11, (%r9)
1135 ; AVX2-FAST-NEXT: vmovdqa %xmm7, (%r10)
1136 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rax)
1137 ; AVX2-FAST-NEXT: vzeroupper
1138 ; AVX2-FAST-NEXT: retq
1139 ;
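; Note: the AVX2-FAST-PERLANE block below appears instruction-for-instruction identical to the AVX2-FAST block above; vf8 only needs in-lane shuffles, so the cross-lane tuning makes no difference here.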
1140 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf8:
1141 ; AVX2-FAST-PERLANE: # %bb.0:
1142 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1143 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10
1144 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3
1145 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4
1146 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0
1147 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm1
1148 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
1149 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm2
1150 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1151 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1152 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
1153 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
1154 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1155 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1156 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1157 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm7
1158 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8
1159 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
1160 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
1161 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
1162 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1163 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11
1164 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1165 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1166 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
1167 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
1168 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
1169 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1170 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11
1171 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1172 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1173 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1174 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1175 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1176 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1177 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1178 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
1179 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7]
1180 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u]
1181 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1182 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1183 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1184 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1185 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
1186 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14
1187 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12
1188 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
1189 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1190 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1191 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1192 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm14
1193 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13
1194 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1195 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1196 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
1197 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1198 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1199 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1200 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1201 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
1202 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1203 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
1204 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1205 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1206 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1207 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1208 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1209 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsi)
1210 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rdx)
1211 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, (%rcx)
1212 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, (%r8)
1213 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%r9)
1214 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, (%r10)
1215 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rax)
1216 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1217 ; AVX2-FAST-PERLANE-NEXT: retq
1218 ;
1219 ; AVX512F-SLOW-LABEL: load_i16_stride7_vf8:
1220 ; AVX512F-SLOW: # %bb.0:
1221 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1222 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1223 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0
1224 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1
1225 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1226 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
1227 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1228 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1229 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4
1230 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
1231 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1232 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
1233 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1234 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1235 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1236 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1237 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1238 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
1239 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
1240 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1241 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1242 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1243 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1244 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
1245 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1246 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
1247 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
1248 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1249 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1250 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1251 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1252 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
1253 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1254 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1255 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1256 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1257 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1258 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
1259 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,2,1,0,4,5,6,7]
1260 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
1261 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1262 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1263 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1264 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1265 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1266 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
1267 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1268 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
1269 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1270 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1271 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1272 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
1273 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
1274 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
1275 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1276 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
1277 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1278 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1279 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1280 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
1281 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6]
1282 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1283 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1284 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
1285 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
1286 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
1287 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
1288 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1289 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1290 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1291 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1292 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1293 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, (%rsi)
1294 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rdx)
1295 ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%rcx)
1296 ; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%r8)
1297 ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%r9)
1298 ; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%r10)
1299 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rax)
1300 ; AVX512F-SLOW-NEXT: vzeroupper
1301 ; AVX512F-SLOW-NEXT: retq
1302 ;
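; Note: without AVX512BW there is no 16-bit cross-register permute (vpermw/vpermi2w), so the AVX512F variants still assemble each output from 128-bit and 256-bit blends and shuffles; the AVX512F-FAST flavour below mainly swaps immediate shuffles for vpshufb/vpbroadcastd masks relative to the AVX512F-SLOW flavour above.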
1303 ; AVX512F-FAST-LABEL: load_i16_stride7_vf8:
1304 ; AVX512F-FAST: # %bb.0:
1305 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
1306 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
1307 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0
1308 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
1309 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1310 ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm2
1311 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1312 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,12,13,10,11,4,5]
1313 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4
1314 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
1315 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1316 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
1317 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
1318 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u]
1319 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1320 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1321 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1322 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5]
1323 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1324 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
1325 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1326 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1327 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
1328 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1329 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7]
1330 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1331 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
1332 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1333 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1334 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
1335 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1336 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1337 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1338 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1339 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
1340 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
1341 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u]
1342 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1343 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1344 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1345 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1346 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
1347 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12
1348 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
1349 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
1350 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
1351 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1352 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1353 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12
1354 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11
1355 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1356 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1357 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1358 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1359 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1360 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1361 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1362 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
1363 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1364 ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
1365 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
1366 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1367 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1368 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1369 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1370 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%rsi)
1371 ; AVX512F-FAST-NEXT: vmovdqa %xmm6, (%rdx)
1372 ; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%rcx)
1373 ; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%r8)
1374 ; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%r9)
1375 ; AVX512F-FAST-NEXT: vmovdqa %xmm10, (%r10)
1376 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax)
1377 ; AVX512F-FAST-NEXT: vzeroupper
1378 ; AVX512F-FAST-NEXT: retq
1379 ;
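; Note: with AVX512BW the whole deinterleave below collapses to seven vpermi2w instructions, one per output, each driven by the constant index vector <k, k+7, k+14, ..., k+49> selecting every seventh word from the two 512-bit input halves.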
1380 ; AVX512BW-LABEL: load_i16_stride7_vf8:
1381 ; AVX512BW: # %bb.0:
1382 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1383 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1384 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1385 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1386 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49]
1387 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1388 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50]
1389 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1390 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51]
1391 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1392 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52]
1393 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
1394 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53]
1395 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
1396 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54]
1397 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
1398 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55]
1399 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
1400 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
1401 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
1402 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
1403 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
1404 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
1405 ; AVX512BW-NEXT: vmovdqa %xmm7, (%r10)
1406 ; AVX512BW-NEXT: vmovdqa %xmm8, (%rax)
1407 ; AVX512BW-NEXT: vzeroupper
1408 ; AVX512BW-NEXT: retq
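; The IR below is the pattern every configuration above lowers: a single wide <56 x i16> load followed by seven shufflevectors, where result k selects elements k, k+7, ..., k+49 (i.e. every seventh i16 starting at offset k).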
1409 %wide.vec = load <56 x i16>, ptr %in.vec, align 64
1410 %strided.vec0 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
1411 %strided.vec1 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
1412 %strided.vec2 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
1413 %strided.vec3 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
1414 %strided.vec4 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
1415 %strided.vec5 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
1416 %strided.vec6 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
1417 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
1418 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
1419 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
1420 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
1421 store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
1422 store <8 x i16> %strided.vec5, ptr %out.vec5, align 64
1423 store <8 x i16> %strided.vec6, ptr %out.vec6, align 64
1424 ret void
1425 }
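; For vf16 each of the seven outputs is 32 bytes, so the same seven-way shuffle structure repeats at twice the width; both the SSE and AVX1 versions below now need stack spills for their intermediates.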
1427 define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
1428 ; SSE-LABEL: load_i16_stride7_vf16:
1429 ; SSE: # %bb.0:
1430 ; SSE-NEXT: subq $232, %rsp
1431 ; SSE-NEXT: movdqa 80(%rdi), %xmm11
1432 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1433 ; SSE-NEXT: movdqa 64(%rdi), %xmm9
1434 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1435 ; SSE-NEXT: movdqa 112(%rdi), %xmm12
1436 ; SSE-NEXT: movdqa 128(%rdi), %xmm6
1437 ; SSE-NEXT: movaps 160(%rdi), %xmm5
1438 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
1439 ; SSE-NEXT: movaps 144(%rdi), %xmm7
1440 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1441 ; SSE-NEXT: movdqa 192(%rdi), %xmm13
1442 ; SSE-NEXT: movdqa 176(%rdi), %xmm15
1443 ; SSE-NEXT: movdqa 208(%rdi), %xmm14
1444 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
1445 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1446 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
1447 ; SSE-NEXT: movdqa %xmm1, %xmm2
1448 ; SSE-NEXT: pandn %xmm0, %xmm2
1449 ; SSE-NEXT: movdqa %xmm15, %xmm0
1450 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
1451 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1452 ; SSE-NEXT: pand %xmm1, %xmm0
1453 ; SSE-NEXT: por %xmm2, %xmm0
1454 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
1455 ; SSE-NEXT: movdqa %xmm3, %xmm2
1456 ; SSE-NEXT: movdqa %xmm3, %xmm10
1457 ; SSE-NEXT: pandn %xmm0, %xmm2
1458 ; SSE-NEXT: movaps %xmm7, %xmm0
1459 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2]
1460 ; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
1461 ; SSE-NEXT: movaps %xmm8, %xmm4
1462 ; SSE-NEXT: andnps %xmm0, %xmm4
1463 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
1464 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1465 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3]
1466 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1467 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7]
1468 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1469 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
1470 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1471 ; SSE-NEXT: pand %xmm8, %xmm3
1472 ; SSE-NEXT: por %xmm4, %xmm3
1473 ; SSE-NEXT: pand %xmm10, %xmm3
1474 ; SSE-NEXT: por %xmm2, %xmm3
1475 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1476 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1477 ; SSE-NEXT: movdqa %xmm1, %xmm2
1478 ; SSE-NEXT: pandn %xmm0, %xmm2
1479 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
1480 ; SSE-NEXT: pand %xmm1, %xmm9
1481 ; SSE-NEXT: por %xmm2, %xmm9
1482 ; SSE-NEXT: movdqa %xmm10, %xmm2
1483 ; SSE-NEXT: pandn %xmm9, %xmm2
1484 ; SSE-NEXT: movaps 32(%rdi), %xmm0
1485 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1486 ; SSE-NEXT: movaps 48(%rdi), %xmm3
1487 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1488 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
1489 ; SSE-NEXT: movaps %xmm8, %xmm4
1490 ; SSE-NEXT: andnps %xmm0, %xmm4
1491 ; SSE-NEXT: movdqa (%rdi), %xmm0
1492 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1493 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1494 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7]
1495 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
1496 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1497 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
1498 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1499 ; SSE-NEXT: pand %xmm8, %xmm3
1500 ; SSE-NEXT: por %xmm4, %xmm3
1501 ; SSE-NEXT: pand %xmm10, %xmm3
1502 ; SSE-NEXT: por %xmm2, %xmm3
1503 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1504 ; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5]
1505 ; SSE-NEXT: movdqa %xmm1, %xmm2
1506 ; SSE-NEXT: pandn %xmm14, %xmm2
1507 ; SSE-NEXT: psrld $16, %xmm13
1508 ; SSE-NEXT: movdqa %xmm15, %xmm4
1509 ; SSE-NEXT: movdqa %xmm15, %xmm11
1510 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1511 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1512 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
1513 ; SSE-NEXT: pand %xmm1, %xmm4
1514 ; SSE-NEXT: por %xmm2, %xmm4
1515 ; SSE-NEXT: movdqa %xmm10, %xmm0
1516 ; SSE-NEXT: movdqa %xmm10, %xmm14
1517 ; SSE-NEXT: pandn %xmm4, %xmm0
1518 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
1519 ; SSE-NEXT: movdqa %xmm7, %xmm4
1520 ; SSE-NEXT: pandn %xmm12, %xmm4
1521 ; SSE-NEXT: pand %xmm7, %xmm6
1522 ; SSE-NEXT: por %xmm4, %xmm6
1523 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1524 ; SSE-NEXT: movdqa %xmm9, %xmm4
1525 ; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload
1526 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
1527 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
1528 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
1529 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
1530 ; SSE-NEXT: movdqa %xmm8, %xmm2
1531 ; SSE-NEXT: pandn %xmm4, %xmm2
1532 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
1533 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
1534 ; SSE-NEXT: pand %xmm8, %xmm4
1535 ; SSE-NEXT: por %xmm4, %xmm2
1536 ; SSE-NEXT: pand %xmm10, %xmm2
1537 ; SSE-NEXT: por %xmm0, %xmm2
1538 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1539 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1540 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1541 ; SSE-NEXT: movdqa %xmm1, %xmm4
1542 ; SSE-NEXT: pandn %xmm0, %xmm4
1543 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1544 ; SSE-NEXT: movdqa %xmm3, %xmm0
1545 ; SSE-NEXT: psrld $16, %xmm0
1546 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1547 ; SSE-NEXT: movdqa %xmm15, %xmm5
1548 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1549 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
1550 ; SSE-NEXT: pand %xmm1, %xmm5
1551 ; SSE-NEXT: por %xmm4, %xmm5
1552 ; SSE-NEXT: movdqa %xmm10, %xmm0
1553 ; SSE-NEXT: pandn %xmm5, %xmm0
1554 ; SSE-NEXT: movdqa %xmm7, %xmm4
1555 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1556 ; SSE-NEXT: pandn %xmm10, %xmm4
1557 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1558 ; SSE-NEXT: movdqa %xmm12, %xmm5
1559 ; SSE-NEXT: pand %xmm7, %xmm5
1560 ; SSE-NEXT: por %xmm4, %xmm5
1561 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
1562 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
1563 ; SSE-NEXT: pand %xmm8, %xmm4
1564 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1565 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1566 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
1567 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,1]
1568 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
1569 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
1570 ; SSE-NEXT: pandn %xmm5, %xmm8
1571 ; SSE-NEXT: por %xmm4, %xmm8
1572 ; SSE-NEXT: pand %xmm14, %xmm8
1573 ; SSE-NEXT: por %xmm0, %xmm8
1574 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1575 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1576 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1]
1577 ; SSE-NEXT: movdqa %xmm1, %xmm4
1578 ; SSE-NEXT: pandn %xmm0, %xmm4
1579 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1580 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
1581 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1]
1582 ; SSE-NEXT: pand %xmm1, %xmm0
1583 ; SSE-NEXT: por %xmm4, %xmm0
1584 ; SSE-NEXT: movdqa %xmm14, %xmm4
1585 ; SSE-NEXT: pandn %xmm0, %xmm4
1586 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3]
1587 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,4,7]
1588 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
1589 ; SSE-NEXT: movdqa %xmm13, %xmm2
1590 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm0[1]
1591 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1592 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1593 ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
1594 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3]
1595 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1596 ; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3]
1597 ; SSE-NEXT: andps %xmm14, %xmm11
1598 ; SSE-NEXT: orps %xmm4, %xmm11
1599 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1600 ; SSE-NEXT: movdqa %xmm15, %xmm0
1601 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1602 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1603 ; SSE-NEXT: pand %xmm1, %xmm0
1604 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1605 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1]
1606 ; SSE-NEXT: pandn %xmm4, %xmm1
1607 ; SSE-NEXT: por %xmm0, %xmm1
1608 ; SSE-NEXT: movdqa %xmm14, %xmm0
1609 ; SSE-NEXT: movaps %xmm14, %xmm15
1610 ; SSE-NEXT: pandn %xmm1, %xmm0
1611 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1612 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3]
1613 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7]
1614 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1615 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3]
1616 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1]
1617 ; SSE-NEXT: movdqa %xmm10, %xmm1
1618 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
1619 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
1620 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
1621 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3]
1622 ; SSE-NEXT: andps %xmm15, %xmm4
1623 ; SSE-NEXT: orps %xmm0, %xmm4
1624 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1625 ; SSE-NEXT: movdqa %xmm7, %xmm0
1626 ; SSE-NEXT: pandn %xmm9, %xmm0
1627 ; SSE-NEXT: pand %xmm7, %xmm2
1628 ; SSE-NEXT: por %xmm0, %xmm2
1629 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7]
1630 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1631 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
1632 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
1633 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
1634 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1635 ; SSE-NEXT: movdqa %xmm4, %xmm5
1636 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
1637 ; SSE-NEXT: movdqa %xmm6, %xmm12
1638 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
1639 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
1640 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1641 ; SSE-NEXT: movdqa %xmm8, %xmm6
1642 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
1643 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
1644 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
1645 ; SSE-NEXT: movaps %xmm15, %xmm2
1646 ; SSE-NEXT: andnps %xmm5, %xmm2
1647 ; SSE-NEXT: andps %xmm15, %xmm0
1648 ; SSE-NEXT: orps %xmm0, %xmm2
1649 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1650 ; SSE-NEXT: movdqa %xmm7, %xmm0
1651 ; SSE-NEXT: pandn %xmm13, %xmm0
1652 ; SSE-NEXT: movdqa %xmm13, %xmm9
1653 ; SSE-NEXT: movdqa %xmm14, %xmm5
1654 ; SSE-NEXT: movdqa %xmm14, %xmm13
1655 ; SSE-NEXT: pand %xmm7, %xmm5
1656 ; SSE-NEXT: por %xmm0, %xmm5
1657 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7]
1658 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1659 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1660 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
1661 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1662 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1663 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1664 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
1665 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1666 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
1667 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1668 ; SSE-NEXT: andps %xmm15, %xmm0
1669 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1670 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1671 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
1672 ; SSE-NEXT: andnps %xmm1, %xmm15
1673 ; SSE-NEXT: orps %xmm0, %xmm15
1674 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1675 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1676 ; SSE-NEXT: movdqa %xmm0, %xmm14
1677 ; SSE-NEXT: psrld $16, %xmm14
1678 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1679 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
1680 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1681 ; SSE-NEXT: movdqa %xmm0, %xmm5
1682 ; SSE-NEXT: movdqa %xmm2, %xmm0
1683 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
1684 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1685 ; SSE-NEXT: psrlq $48, %xmm0
1686 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1687 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1688 ; SSE-NEXT: movdqa %xmm8, %xmm0
1689 ; SSE-NEXT: psrlq $16, %xmm0
1690 ; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload
1691 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1692 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1693 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1694 ; SSE-NEXT: movdqa %xmm4, %xmm5
1695 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3]
1696 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
1697 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
1698 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1699 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
1700 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
1701 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1702 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
1703 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1704 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1705 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1706 ; SSE-NEXT: movdqa %xmm10, %xmm15
1707 ; SSE-NEXT: psrld $16, %xmm15
1708 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1709 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
1710 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
1711 ; SSE-NEXT: movdqa %xmm2, %xmm1
1712 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
1713 ; SSE-NEXT: psrlq $48, %xmm1
1714 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1715 ; SSE-NEXT: movdqa %xmm9, %xmm4
1716 ; SSE-NEXT: movdqa %xmm9, %xmm1
1717 ; SSE-NEXT: psrlq $16, %xmm1
1718 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1719 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
1720 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1721 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1722 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3]
1723 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1724 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1725 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1726 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1727 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
1728 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1729 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1730 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
1731 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1732 ; SSE-NEXT: movdqa %xmm8, %xmm0
1733 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1734 ; SSE-NEXT: movdqa %xmm11, %xmm8
1735 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1736 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1737 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
1738 ; SSE-NEXT: movdqa %xmm7, %xmm0
1739 ; SSE-NEXT: pandn %xmm5, %xmm0
1740 ; SSE-NEXT: movdqa %xmm12, %xmm11
1741 ; SSE-NEXT: pand %xmm7, %xmm11
1742 ; SSE-NEXT: por %xmm0, %xmm11
1743 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3]
1744 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
1745 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1746 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1747 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2]
1748 ; SSE-NEXT: pand %xmm7, %xmm2
1749 ; SSE-NEXT: movdqa %xmm9, %xmm11
1750 ; SSE-NEXT: pandn %xmm9, %xmm7
1751 ; SSE-NEXT: por %xmm2, %xmm7
1752 ; SSE-NEXT: movdqa %xmm4, %xmm0
1753 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
1754 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1755 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1756 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
1757 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3]
1758 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
1759 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1760 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1761 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2]
1762 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
1763 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1764 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1765 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1766 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
1767 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1768 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
1769 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
1770 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1771 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7]
1772 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
1773 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
1774 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
1775 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
1776 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
1777 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1778 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1]
1779 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1780 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
1781 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
1782 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,1,2,3]
1783 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
1784 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
1785 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1786 ; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7]
1787 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1788 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
1789 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
1790 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1791 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1792 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1]
1793 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1794 ; SSE-NEXT: movaps %xmm2, (%rsi)
1795 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1796 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
1797 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1798 ; SSE-NEXT: movaps %xmm2, (%rdx)
1799 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1800 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
1801 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1802 ; SSE-NEXT: movaps %xmm2, (%rcx)
1803 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1804 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
1805 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1806 ; SSE-NEXT: movaps %xmm2, (%r8)
1807 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1808 ; SSE-NEXT: movaps %xmm2, 16(%r8)
1809 ; SSE-NEXT: movapd %xmm1, (%r9)
1810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1811 ; SSE-NEXT: movaps %xmm1, 16(%r9)
1812 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1813 ; SSE-NEXT: movaps %xmm15, (%rax)
1814 ; SSE-NEXT: movaps %xmm14, 16(%rax)
1815 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1816 ; SSE-NEXT: movapd %xmm11, (%rax)
1817 ; SSE-NEXT: movapd %xmm0, 16(%rax)
1818 ; SSE-NEXT: addq $232, %rsp
1819 ; SSE-NEXT: retq
1820 ;
1821 ; AVX1-ONLY-LABEL: load_i16_stride7_vf16:
1822 ; AVX1-ONLY: # %bb.0:
1823 ; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108
1824 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0
1825 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1826 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0
1827 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm12
1828 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
1829 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1830 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
1831 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1832 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1
1833 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2
1834 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1835 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8
1836 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1837 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2
1838 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1839 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1
1840 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1841 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1842 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1843 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1844 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1845 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5
1846 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
1847 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1848 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1849 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1850 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
1851 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0]
1852 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7
1853 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,6],xmm10[7]
1854 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6
1855 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,3,2,3]
1856 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
1857 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11
1858 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
1859 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1860 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9
1861 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,3,3]
1862 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10
1863 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,1,0,3]
1864 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7]
1865 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1866 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1
1867 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2
1868 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm2[2],zero
1869 ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm13
1870 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1871 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1872 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7]
1873 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
1874 ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm15, %ymm11
1875 ; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9
1876 ; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm9
1877 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1878 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7]
1879 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1880 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1881 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
1882 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4
1883 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1884 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1885 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload
1886 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5],xmm8[6],mem[7]
1887 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
1888 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,3,2,4,5,6,7]
1889 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
1890 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1891 ; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm9
1892 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1893 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1894 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
1895 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9
1896 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1897 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1898 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1899 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5]
1900 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1901 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm11[7]
1902 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm11
1903 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5
1904 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1905 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9
1906 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
1907 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
1908 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1909 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7]
1910 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm13
1911 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1912 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3]
1913 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,3,4,5,6,7]
1914 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4],xmm15[5,6,7]
1915 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
1916 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm9
1917 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm15
1918 ; AVX1-ONLY-NEXT: vorps %ymm9, %ymm15, %ymm9
1919 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1920 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
1921 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0
1922 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm9
1923 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0
1924 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1925 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm0
1926 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1927 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15
1928 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1929 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill
1930 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,3,2,3]
1931 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7]
1932 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1933 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7]
1934 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
1935 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1936 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11
1937 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
1938 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4
1939 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
1940 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1941 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2]
1942 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm2
1943 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,5],xmm9[6,7]
1944 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,1]
1945 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7]
1946 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,1,1]
1947 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
1948 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1949 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,3]
1950 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
1951 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1952 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3]
1953 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm12[1]
1954 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1955 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,1,2,3]
1956 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
1957 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3,4,5,6,7]
1958 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
1959 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9
1960 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
1961 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1
1962 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1963 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
1964 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
1965 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
1966 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
1967 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1968 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
1969 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1970 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7]
1971 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1972 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm5[6],xmm15[7]
1973 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1974 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
1975 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7]
1976 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1977 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7]
1978 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
1979 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7]
1980 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,6,7]
1981 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2]
1982 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13
1983 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm7[1],xmm14[2,3,4,5,6,7]
1984 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,0,4,5,6,7]
1985 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
1986 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3,4,5,6,7]
1987 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1988 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11
1989 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
1990 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,7]
1991 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1992 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
1993 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3]
1994 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1995 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm12
1996 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
1997 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
1998 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9
1999 ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10
2000 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm9
2001 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2002 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2003 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1
2004 ; AVX1-ONLY-NEXT: vandps %ymm15, %ymm9, %ymm9
2005 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1
2006 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2007 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
2008 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
2009 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7]
2010 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,3,2,3]
2011 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7]
2012 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2013 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
2014 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7]
2015 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2016 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
2017 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
2018 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2019 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm1
2020 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2021 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3]
2022 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
2023 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm9
2024 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
2025 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7]
2026 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3]
2027 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
2028 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2029 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
2030 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
2031 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7]
2032 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
2033 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
2034 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7]
2035 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2036 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0
2037 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1
2038 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
2039 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2040 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
2041 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
2042 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm1
2043 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15
2044 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2045 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2046 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
2047 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
2048 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2049 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2050 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7]
2051 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2052 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
2053 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2054 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1
2055 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2056 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2057 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
2058 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2]
2059 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7]
2060 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7]
2061 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
2062 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,7,7]
2063 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
2064 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7]
2065 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,2]
2066 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2067 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
2068 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7]
2069 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2070 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2071 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0
2072 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1
2073 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
2074 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2075 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2076 ; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1],mem[0],zero
2077 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
2078 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7]
2079 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7]
2080 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2081 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2082 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2083 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1]
2084 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2085 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2086 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2087 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2088 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
2089 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
2090 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
2091 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
2092 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3]
2093 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,0,3]
2094 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2095 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
2096 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3]
2097 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2098 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7]
2099 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2100 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1
2101 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2
2102 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
2103 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2104 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi)
2105 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2106 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx)
2107 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2108 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx)
2109 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2110 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8)
2111 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2112 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9)
2113 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
2114 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
2115 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
2116 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax)
2117 ; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108
2118 ; AVX1-ONLY-NEXT: vzeroupper
2119 ; AVX1-ONLY-NEXT: retq
2120 ;
2121 ; AVX2-SLOW-LABEL: load_i16_stride7_vf16:
2122 ; AVX2-SLOW: # %bb.0:
2123 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2124 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2125 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
2126 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3
2127 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9
2128 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5
2129 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6
2130 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2131 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,4,7]
2132 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
2133 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
2134 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
2135 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
2136 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2137 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[0,1,0,2]
2138 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,1,4,5,6,5]
2139 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2140 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
2141 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2142 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
2143 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5],xmm10[6],xmm8[7]
2144 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2145 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2146 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3]
2147 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2148 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
2149 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
2150 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
2151 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2152 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2153 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
2154 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7]
2155 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2156 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2157 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
2158 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7,8,9,10],ymm11[11],ymm10[12,13,14,15]
2159 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
2160 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
2161 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm11, %ymm8
2162 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2163 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2164 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
2165 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2166 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2167 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2168 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
2169 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7]
2170 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2171 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2172 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2173 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11
2174 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7]
2175 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2176 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1]
2177 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2178 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4],ymm12[5,6,7,8,9,10,11],ymm11[12],ymm12[13,14,15]
2179 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
2180 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm11, %ymm8
2181 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2182 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2183 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
2184 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2185 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2186 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,2]
2187 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,0,3,4,5,4,7]
2188 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2189 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
2190 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15]
2191 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
2192 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2193 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2194 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
2195 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7]
2196 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7]
2197 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2198 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
2199 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4]
2200 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
2201 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
2202 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2203 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13
2204 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7]
2205 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,2,1,0,4,5,6,7]
2206 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7]
2207 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2208 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,3,2,3]
2209 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2210 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10
2211 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
2212 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2213 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2214 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2215 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
2216 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2217 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2218 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5]
2219 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2220 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
2221 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2222 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2223 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
2224 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2225 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14
2226 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
2227 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
2228 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
2229 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2230 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1]
2231 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15]
2232 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2233 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2234 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2235 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2236 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
2237 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2238 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2239 ; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm12
2240 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13
2241 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
2242 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
2243 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
2244 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2245 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15]
2246 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2247 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2248 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
2249 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2250 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4
2251 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
2252 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2253 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
2254 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2255 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
2256 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
2257 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15]
2258 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3]
2259 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7]
2260 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
2261 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
2262 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
2263 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
2264 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
2265 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2266 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2267 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2268 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2269 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
2270 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2271 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
2272 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
2273 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2274 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
2275 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
2276 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2277 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
2278 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2279 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
2280 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
2281 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2282 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2283 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2284 ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi)
2285 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rdx)
2286 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
2287 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8)
2288 ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%r9)
2289 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2290 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rax)
2291 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2292 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax)
2293 ; AVX2-SLOW-NEXT: vzeroupper
2294 ; AVX2-SLOW-NEXT: retq
2295 ;
2296 ; AVX2-FAST-LABEL: load_i16_stride7_vf16:
2297 ; AVX2-FAST: # %bb.0:
2298 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
2299 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
2300 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2
2301 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3
2302 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
2303 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6
2304 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7
2305 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2]
2306 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
2307 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
2308 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5]
2309 ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1]
2310 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
2311 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
2312 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
2313 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2314 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
2315 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7]
2316 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2317 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2318 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
2319 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2320 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
2321 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
2322 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
2323 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2324 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2325 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
2326 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
2327 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2328 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2329 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,5,1,u,4,u,u,u>
2330 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11
2331 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
2332 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
2333 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10
2334 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
2335 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
2336 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
2337 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2338 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2339 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2340 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
2341 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15]
2342 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm8[4,5,6,7]
2343 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2344 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2345 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12
2346 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7]
2347 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2348 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2349 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,6,1,u,5,u,u,u>
2350 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12
2351 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
2352 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10
2353 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
2354 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
2355 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7]
2356 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2357 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2358 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5]
2359 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm13
2360 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2361 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
2362 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15]
2363 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
2364 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
2365 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
2366 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
2367 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
2368 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2369 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,1,1,3]
2370 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
2371 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
2372 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2373 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2374 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7]
2375 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
2376 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2377 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,3,2,3]
2378 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2379 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11
2380 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15]
2381 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
2382 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2383 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
2384 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15
2385 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14
2386 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
2387 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
2388 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2389 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,7,2,6,u,u,u>
2390 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5
2391 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
2392 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm9
2393 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2],xmm15[3],xmm9[4,5,6,7]
2394 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
2395 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2396 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2397 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
2398 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
2399 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5
2400 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3]
2401 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3,4,5,6,7],ymm5[8],ymm9[9,10,11,12,13,14,15]
2402 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm5[4,5,6,7]
2403 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
2404 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4]
2405 ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1]
2406 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5
2407 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,7,0,0,3,7,0]
2408 ; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1]
2409 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14
2410 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
2411 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
2412 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15]
2413 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2414 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,4,7,3,6,u,u,u>
2415 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14
2416 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2417 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8
2418 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8
2419 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
2420 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
2421 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
2422 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1,2,3,4,5,6,7],ymm12[8],ymm5[9,10,11,12,13,14,15]
2423 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3]
2424 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
2425 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0]
2426 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
2427 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
2428 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
2429 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5]
2430 ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
2431 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4
2432 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
2433 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
2434 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15]
2435 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2436 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,0,3,7,u,u,u>
2437 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
2438 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2
2439 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2440 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
2441 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
2442 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
2443 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2444 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
2445 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2446 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2447 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2448 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi)
2449 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2450 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx)
2451 ; AVX2-FAST-NEXT: vmovdqa %ymm10, (%rcx)
2452 ; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8)
2453 ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9)
2454 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2455 ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax)
2456 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2457 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax)
2458 ; AVX2-FAST-NEXT: vzeroupper
2459 ; AVX2-FAST-NEXT: retq
2460 ;
2461 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf16:
2462 ; AVX2-FAST-PERLANE: # %bb.0:
2463 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
2464 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2465 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2
2466 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3
2467 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm10
2468 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm5
2469 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6
2470 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2471 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7
2472 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,8,9,10,11,6,7,6,7]
2473 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7]
2474 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
2475 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2476 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2]
2477 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
2478 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8
2479 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
2480 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2481 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
2482 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7]
2483 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2484 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2485 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
2486 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2487 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u>
2488 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8
2489 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
2490 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2491 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2492 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
2493 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7]
2494 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2495 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2496 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2497 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7,8,9,10],ymm12[11],ymm11[12,13,14,15]
2498 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
2499 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
2500 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8
2501 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2502 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
2503 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
2504 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2505 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2506 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2507 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
2508 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2509 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2510 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2511 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2512 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12
2513 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7]
2514 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2515 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1]
2516 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2517 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15]
2518 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
2519 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8
2520 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2521 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
2522 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7]
2523 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2524 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2525 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,2]
2526 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
2527 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
2528 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm12[1,2,3,4,5,6,7],ymm8[8],ymm12[9,10,11,12,13,14,15]
2529 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
2530 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2531 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13
2532 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
2533 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
2534 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm13
2535 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,1,3]
2536 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
2537 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
2538 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2539 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14
2540 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7]
2541 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
2542 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2543 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3]
2544 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2545 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm13, %ymm14, %ymm11
2546 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
2547 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2548 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2549 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
2550 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14
2551 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13
2552 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
2553 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
2554 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2555 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2556 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2557 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7,8,9,10,11,12,13],ymm15[14],ymm14[15]
2558 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
2559 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2560 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm4
2561 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7]
2562 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2563 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2564 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9
2565 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7]
2566 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15]
2567 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7]
2568 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2569 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12
2570 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7]
2571 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2572 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2573 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm12
2574 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13
2575 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
2576 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,0,1,14,15,12,13]
2577 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2578 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15]
2579 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2580 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2581 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
2582 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2583 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm7
2584 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7
2585 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
2586 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
2587 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
2588 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15]
2589 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,3]
2590 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7]
2591 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
2592 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
2593 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
2594 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
2595 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
2596 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2597 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15]
2598 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2599 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
2600 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2601 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
2602 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
2603 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2604 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
2605 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
2606 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
2607 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2608 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
2609 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
2610 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2611 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2612 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2613 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
2614 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2615 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx)
2616 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx)
2617 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%r8)
2618 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9)
2619 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2620 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax)
2621 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2622 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax)
2623 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
2624 ; AVX2-FAST-PERLANE-NEXT: retq
2625 ;
2626 ; AVX512F-SLOW-LABEL: load_i16_stride7_vf16:
2627 ; AVX512F-SLOW: # %bb.0:
2628 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
2629 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
2630 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
2631 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3
2632 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2633 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
2634 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2635 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2636 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
2637 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
2638 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,u,u]
2639 ; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4
2640 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5
2641 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6
2642 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2643 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,6,4,7]
2644 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
2645 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
2646 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7]
2647 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2648 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
2649 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9
2650 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,2]
2651 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,1,2,1,4,5,6,5]
2652 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2653 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7]
2654 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
2655 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17
2656 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2657 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10
2658 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6],xmm7[7]
2659 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2660 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
2661 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7]
2662 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
2663 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
2664 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8
2665 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2666 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6,7,8,9,10],ymm8[11],ymm10[12,13,14,15]
2667 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2668 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2669 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2670 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
2671 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2672 ; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8
2673 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2674 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2675 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2676 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
2677 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7]
2678 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2679 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2680 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,1,2]
2681 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7]
2682 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2683 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7]
2684 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2685 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2686 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15]
2687 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2688 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2689 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2690 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
2691 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2692 ; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10
2693 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15]
2694 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
2695 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2696 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3]
2697 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2698 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2699 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2700 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
2701 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2702 ; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10
2703 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2704 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2705 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
2706 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7]
2707 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7]
2708 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2709 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
2710 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4]
2711 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
2712 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
2713 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
2714 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2715 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2716 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11
2717 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
2718 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2719 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2720 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5]
2721 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2722 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
2723 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2724 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2725 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2726 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7,8,9,10,11,12,13],ymm12[14],ymm11[15]
2727 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2728 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
2729 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
2730 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
2731 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
2732 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2733 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
2734 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
2735 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2736 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2737 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
2738 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
2739 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2740 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2741 ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm12
2742 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13
2743 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
2744 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
2745 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
2746 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2747 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15]
2748 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2749 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2750 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
2751 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2752 ; AVX512F-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
2753 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm16[2,1,2,3]
2754 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2755 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
2756 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2757 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
2758 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
2759 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3,4,5,6,7]
2760 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15]
2761 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7]
2762 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
2763 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
2764 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
2765 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
2766 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
2767 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2768 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2769 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2770 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2771 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
2772 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2773 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
2774 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
2775 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2776 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
2777 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
2778 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2779 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
2780 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2781 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
2782 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
2783 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
2784 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2785 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi)
2786 ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx)
2787 ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
2788 ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8)
2789 ; AVX512F-SLOW-NEXT: vmovdqa %ymm10, (%r9)
2790 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2791 ; AVX512F-SLOW-NEXT: vmovdqa %ymm11, (%rax)
2792 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2793 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax)
2794 ; AVX512F-SLOW-NEXT: vzeroupper
2795 ; AVX512F-SLOW-NEXT: retq
2796 ;
2797 ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf16:
2798 ; AVX512F-ONLY-FAST: # %bb.0:
2799 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0
2800 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
2801 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
2802 ; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
2803 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15]
2804 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
2805 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8
2806 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
2807 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6
2808 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
2809 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3
2810 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
2811 ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
2812 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4
2813 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
2814 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
2815 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
2816 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5
2817 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
2818 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
2819 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2820 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm4
2821 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
2822 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
2823 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
2824 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
2825 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
2826 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3
2827 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
2828 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
2829 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm12
2830 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
2831 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2832 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
2833 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2834 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2835 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2836 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
2837 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2838 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
2839 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2840 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
2841 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2842 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6
2843 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
2844 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
2845 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
2846 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2847 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
2848 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2849 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2850 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
2851 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14
2852 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2853 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
2854 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2855 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
2856 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2857 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
2858 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2859 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8
2860 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15]
2861 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
2862 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
2863 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2864 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
2865 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
2866 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2867 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
2868 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
2869 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7]
2870 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
2871 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7
2872 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7]
2873 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9
2874 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2875 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2876 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7
2877 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13
2878 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u>
2879 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15]
2880 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7]
2881 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9
2882 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
2883 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
2884 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
2885 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2886 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2887 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10
2888 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7]
2889 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
2890 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
2891 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14
2892 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
2893 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
2894 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
2895 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
2896 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9
2897 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7]
2898 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15]
2899 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
2900 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u>
2901 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11
2902 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
2903 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
2904 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15]
2905 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
2906 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2907 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12
2908 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
2909 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
2910 ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1]
2911 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
2912 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
2913 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
2914 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13
2915 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
2916 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
2917 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1
2918 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
2919 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
2920 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
2921 ; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1]
2922 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2
2923 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
2924 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
2925 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
2926 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0
2927 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0
2928 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
2929 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
2930 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
2931 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
2932 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2933 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
2934 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
2935 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2936 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi)
2937 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx)
2938 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%rcx)
2939 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8)
2940 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%r9)
2941 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2942 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, (%rax)
2943 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
2944 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax)
2945 ; AVX512F-ONLY-FAST-NEXT: vzeroupper
2946 ; AVX512F-ONLY-FAST-NEXT: retq
2947 ;
2948 ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf16:
2949 ; AVX512DQ-FAST: # %bb.0:
2950 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0
2951 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1
2952 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
2953 ; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1]
2954 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15]
2955 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
2956 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8
2957 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u>
2958 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6
2959 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
2960 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3
2961 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13]
2962 ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
2963 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4
2964 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2
2965 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2]
2966 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
2967 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5
2968 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
2969 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7]
2970 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2971 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm4
2972 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm5
2973 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
2974 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
2975 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7]
2976 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u]
2977 ; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3
2978 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7]
2979 ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
2980 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm12
2981 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
2982 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2983 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7]
2984 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2985 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2986 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2987 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
2988 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2989 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
2990 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
2991 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
2992 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2993 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6
2994 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
2995 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
2996 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
2997 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2998 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
2999 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
3000 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3001 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
3002 ; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14
3003 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
3004 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7]
3005 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3006 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
3007 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
3008 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
3009 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3010 ; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8
3011 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15]
3012 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
3013 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
3014 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
3015 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
3016 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
3017 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3018 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3]
3019 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
3020 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7]
3021 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
3022 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7
3023 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7]
3024 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9
3025 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3026 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3027 ; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7
3028 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13
3029 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u>
3030 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15]
3031 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7]
3032 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9
3033 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
3034 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
3035 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
3036 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
3037 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3038 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10
3039 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7]
3040 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
3041 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
3042 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14
3043 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10
3044 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
3045 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
3046 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
3047 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9
3048 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7]
3049 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15]
3050 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
3051 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u>
3052 ; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11
3053 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
3054 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
3055 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15]
3056 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
3057 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
3058 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12
3059 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u>
3060 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
3061 ; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1]
3062 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
3063 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
3064 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
3065 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13
3066 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
3067 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
3068 ; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1
3069 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
3070 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
3071 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
3072 ; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1]
3073 ; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2
3074 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
3075 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
3076 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3077 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0
3078 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0
3079 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
3080 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
3081 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
3082 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
3083 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
3084 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
3085 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
3086 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3087 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi)
3088 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx)
3089 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%rcx)
3090 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8)
3091 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%r9)
3092 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
3093 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, (%rax)
3094 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
3095 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax)
3096 ; AVX512DQ-FAST-NEXT: vzeroupper
3097 ; AVX512DQ-FAST-NEXT: retq
3098 ;
3099 ; AVX512BW-LABEL: load_i16_stride7_vf16:
3100 ; AVX512BW: # %bb.0:
3101 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3102 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3103 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3104 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3105 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3106 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3107 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
3108 ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
3109 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3110 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u>
3111 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3112 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
3113 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
3114 ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
3115 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3116 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u>
3117 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3118 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
3119 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3120 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
3121 ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
3122 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3123 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u>
3124 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3125 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
3126 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3127 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
3128 ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
3129 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3130 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u>
3131 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3132 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
3133 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3134 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
3135 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
3136 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3137 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u>
3138 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
3139 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
3140 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3141 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
3142 ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
3143 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3144 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u>
3145 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10
3146 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
3147 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3148 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
3149 ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
3150 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3151 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u>
3152 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3153 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15]
3154 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3155 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
3156 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
3157 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx)
3158 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
3159 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9)
3160 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r10)
3161 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
3162 ; AVX512BW-NEXT: vzeroupper
3163 ; AVX512BW-NEXT: retq
3164 %wide.vec = load <112 x i16>, ptr %in.vec, align 64
3165 %strided.vec0 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
3166 %strided.vec1 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
3167 %strided.vec2 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
3168 %strided.vec3 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
3169 %strided.vec4 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
3170 %strided.vec5 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
3171 %strided.vec6 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
3172 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
3173 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
3174 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
3175 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
3176 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
3177 store <16 x i16> %strided.vec5, ptr %out.vec5, align 64
3178 store <16 x i16> %strided.vec6, ptr %out.vec6, align 64
3179 ret void
3180 }
3182 define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
3183 ; SSE-LABEL: load_i16_stride7_vf32:
3184 ; SSE: # %bb.0:
3185 ; SSE-NEXT: subq $600, %rsp # imm = 0x258
3186 ; SSE-NEXT: movdqa 304(%rdi), %xmm5
3187 ; SSE-NEXT: movdqa 288(%rdi), %xmm6
3188 ; SSE-NEXT: movdqa 112(%rdi), %xmm13
3189 ; SSE-NEXT: movdqa 128(%rdi), %xmm8
3190 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3191 ; SSE-NEXT: movaps 160(%rdi), %xmm7
3192 ; SSE-NEXT: movaps 144(%rdi), %xmm10
3193 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3194 ; SSE-NEXT: movdqa 192(%rdi), %xmm9
3195 ; SSE-NEXT: movdqa 176(%rdi), %xmm12
3196 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
3197 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
3198 ; SSE-NEXT: movdqa %xmm1, %xmm11
3199 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3200 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
3201 ; SSE-NEXT: movdqa %xmm2, %xmm1
3202 ; SSE-NEXT: pandn %xmm0, %xmm1
3203 ; SSE-NEXT: movdqa %xmm12, %xmm0
3204 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3205 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
3206 ; SSE-NEXT: pand %xmm2, %xmm0
3207 ; SSE-NEXT: por %xmm1, %xmm0
3208 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
3209 ; SSE-NEXT: movdqa %xmm15, %xmm1
3210 ; SSE-NEXT: pandn %xmm0, %xmm1
3211 ; SSE-NEXT: movaps %xmm10, %xmm0
3212 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2]
3213 ; SSE-NEXT: movaps %xmm7, %xmm10
3214 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3215 ; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535]
3216 ; SSE-NEXT: movaps %xmm14, %xmm3
3217 ; SSE-NEXT: andnps %xmm0, %xmm3
3218 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
3219 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3]
3220 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3221 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
3222 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3223 ; SSE-NEXT: movdqa 320(%rdi), %xmm0
3224 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3225 ; SSE-NEXT: pand %xmm14, %xmm4
3226 ; SSE-NEXT: por %xmm3, %xmm4
3227 ; SSE-NEXT: pand %xmm15, %xmm4
3228 ; SSE-NEXT: por %xmm1, %xmm4
3229 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3230 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3231 ; SSE-NEXT: movdqa %xmm2, %xmm1
3232 ; SSE-NEXT: pandn %xmm0, %xmm1
3233 ; SSE-NEXT: movdqa %xmm6, %xmm0
3234 ; SSE-NEXT: movdqa %xmm6, %xmm7
3235 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3236 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
3237 ; SSE-NEXT: movdqa %xmm5, %xmm6
3238 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3239 ; SSE-NEXT: pand %xmm2, %xmm0
3240 ; SSE-NEXT: por %xmm1, %xmm0
3241 ; SSE-NEXT: movdqa %xmm15, %xmm1
3242 ; SSE-NEXT: pandn %xmm0, %xmm1
3243 ; SSE-NEXT: movaps 272(%rdi), %xmm3
3244 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3245 ; SSE-NEXT: movaps 256(%rdi), %xmm0
3246 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3247 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
3248 ; SSE-NEXT: movaps %xmm14, %xmm3
3249 ; SSE-NEXT: andnps %xmm0, %xmm3
3250 ; SSE-NEXT: movdqa 224(%rdi), %xmm0
3251 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3252 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3253 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
3254 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
3255 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3256 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
3257 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3258 ; SSE-NEXT: pand %xmm14, %xmm4
3259 ; SSE-NEXT: por %xmm3, %xmm4
3260 ; SSE-NEXT: pand %xmm15, %xmm4
3261 ; SSE-NEXT: por %xmm1, %xmm4
3262 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3263 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
3264 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3265 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3266 ; SSE-NEXT: movdqa %xmm2, %xmm1
3267 ; SSE-NEXT: pandn %xmm0, %xmm1
3268 ; SSE-NEXT: movdqa 416(%rdi), %xmm3
3269 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3270 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
3271 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
3272 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3273 ; SSE-NEXT: pand %xmm2, %xmm0
3274 ; SSE-NEXT: por %xmm1, %xmm0
3275 ; SSE-NEXT: movdqa %xmm15, %xmm1
3276 ; SSE-NEXT: pandn %xmm0, %xmm1
3277 ; SSE-NEXT: movaps 384(%rdi), %xmm3
3278 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3279 ; SSE-NEXT: movaps 368(%rdi), %xmm0
3280 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3281 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
3282 ; SSE-NEXT: movaps %xmm14, %xmm3
3283 ; SSE-NEXT: andnps %xmm0, %xmm3
3284 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
3285 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3286 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3287 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
3288 ; SSE-NEXT: movdqa 352(%rdi), %xmm8
3289 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
3290 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3291 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3292 ; SSE-NEXT: pand %xmm14, %xmm4
3293 ; SSE-NEXT: por %xmm3, %xmm4
3294 ; SSE-NEXT: pand %xmm15, %xmm4
3295 ; SSE-NEXT: por %xmm1, %xmm4
3296 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3297 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
3298 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3299 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3300 ; SSE-NEXT: movdqa %xmm2, %xmm1
3301 ; SSE-NEXT: pandn %xmm0, %xmm1
3302 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
3303 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3304 ; SSE-NEXT: movdqa 64(%rdi), %xmm0
3305 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3306 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
3307 ; SSE-NEXT: pand %xmm2, %xmm0
3308 ; SSE-NEXT: por %xmm1, %xmm0
3309 ; SSE-NEXT: movdqa %xmm15, %xmm1
3310 ; SSE-NEXT: pandn %xmm0, %xmm1
3311 ; SSE-NEXT: movaps 32(%rdi), %xmm0
3312 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3313 ; SSE-NEXT: movaps 48(%rdi), %xmm4
3314 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3315 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
3316 ; SSE-NEXT: movaps %xmm14, %xmm3
3317 ; SSE-NEXT: andnps %xmm0, %xmm3
3318 ; SSE-NEXT: movdqa (%rdi), %xmm0
3319 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3320 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
3321 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
3322 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
3323 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3324 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
3325 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3326 ; SSE-NEXT: pand %xmm14, %xmm4
3327 ; SSE-NEXT: por %xmm3, %xmm4
3328 ; SSE-NEXT: pand %xmm15, %xmm4
3329 ; SSE-NEXT: por %xmm1, %xmm4
3330 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3331 ; SSE-NEXT: movdqa %xmm11, %xmm0
3332 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3333 ; SSE-NEXT: movdqa %xmm2, %xmm1
3334 ; SSE-NEXT: pandn %xmm0, %xmm1
3335 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3336 ; SSE-NEXT: movdqa %xmm9, %xmm0
3337 ; SSE-NEXT: psrld $16, %xmm0
3338 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3339 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
3340 ; SSE-NEXT: pand %xmm2, %xmm12
3341 ; SSE-NEXT: por %xmm1, %xmm12
3342 ; SSE-NEXT: movdqa %xmm15, %xmm0
3343 ; SSE-NEXT: pandn %xmm12, %xmm0
3344 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535]
3345 ; SSE-NEXT: movdqa %xmm11, %xmm1
3346 ; SSE-NEXT: pandn %xmm13, %xmm1
3347 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3348 ; SSE-NEXT: movdqa %xmm13, %xmm4
3349 ; SSE-NEXT: pand %xmm11, %xmm4
3350 ; SSE-NEXT: por %xmm1, %xmm4
3351 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3352 ; SSE-NEXT: movdqa %xmm12, %xmm1
3353 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3354 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
3355 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
3356 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
3357 ; SSE-NEXT: movdqa %xmm14, %xmm3
3358 ; SSE-NEXT: pandn %xmm1, %xmm3
3359 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3360 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3361 ; SSE-NEXT: pand %xmm14, %xmm1
3362 ; SSE-NEXT: por %xmm1, %xmm3
3363 ; SSE-NEXT: pand %xmm15, %xmm3
3364 ; SSE-NEXT: por %xmm0, %xmm3
3365 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3366 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3367 ; SSE-NEXT: movdqa %xmm5, %xmm0
3368 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3369 ; SSE-NEXT: movdqa %xmm2, %xmm1
3370 ; SSE-NEXT: pandn %xmm0, %xmm1
3371 ; SSE-NEXT: psrld $16, %xmm6
3372 ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3373 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
3374 ; SSE-NEXT: pand %xmm2, %xmm7
3375 ; SSE-NEXT: por %xmm1, %xmm7
3376 ; SSE-NEXT: movdqa %xmm15, %xmm0
3377 ; SSE-NEXT: pandn %xmm7, %xmm0
3378 ; SSE-NEXT: movdqa %xmm11, %xmm1
3379 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3380 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3381 ; SSE-NEXT: pand %xmm11, %xmm4
3382 ; SSE-NEXT: por %xmm1, %xmm4
3383 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3384 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3385 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3386 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
3387 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
3388 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
3389 ; SSE-NEXT: movdqa %xmm14, %xmm3
3390 ; SSE-NEXT: pandn %xmm1, %xmm3
3391 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3392 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3393 ; SSE-NEXT: pand %xmm14, %xmm1
3394 ; SSE-NEXT: por %xmm1, %xmm3
3395 ; SSE-NEXT: pand %xmm15, %xmm3
3396 ; SSE-NEXT: por %xmm0, %xmm3
3397 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3398 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3399 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3400 ; SSE-NEXT: movdqa %xmm2, %xmm1
3401 ; SSE-NEXT: pandn %xmm0, %xmm1
3402 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3403 ; SSE-NEXT: psrld $16, %xmm0
3404 ; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload
3405 ; SSE-NEXT: movdqa %xmm7, %xmm4
3406 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3407 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
3408 ; SSE-NEXT: pand %xmm2, %xmm4
3409 ; SSE-NEXT: por %xmm1, %xmm4
3410 ; SSE-NEXT: movdqa %xmm15, %xmm0
3411 ; SSE-NEXT: pandn %xmm4, %xmm0
3412 ; SSE-NEXT: movdqa %xmm11, %xmm1
3413 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3414 ; SSE-NEXT: movdqa %xmm8, %xmm4
3415 ; SSE-NEXT: pand %xmm11, %xmm4
3416 ; SSE-NEXT: por %xmm1, %xmm4
3417 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3418 ; SSE-NEXT: movdqa %xmm3, %xmm1
3419 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3420 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3421 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
3422 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
3423 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
3424 ; SSE-NEXT: movdqa %xmm14, %xmm8
3425 ; SSE-NEXT: pandn %xmm1, %xmm8
3426 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3427 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3428 ; SSE-NEXT: pand %xmm14, %xmm1
3429 ; SSE-NEXT: por %xmm1, %xmm8
3430 ; SSE-NEXT: pand %xmm15, %xmm8
3431 ; SSE-NEXT: por %xmm0, %xmm8
3432 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3433 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3434 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3435 ; SSE-NEXT: movdqa %xmm2, %xmm1
3436 ; SSE-NEXT: pandn %xmm0, %xmm1
3437 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3438 ; SSE-NEXT: psrld $16, %xmm0
3439 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3440 ; SSE-NEXT: movdqa %xmm10, %xmm4
3441 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3442 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
3443 ; SSE-NEXT: pand %xmm2, %xmm4
3444 ; SSE-NEXT: por %xmm1, %xmm4
3445 ; SSE-NEXT: movdqa %xmm15, %xmm0
3446 ; SSE-NEXT: pandn %xmm4, %xmm0
3447 ; SSE-NEXT: movdqa %xmm11, %xmm1
3448 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3449 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3450 ; SSE-NEXT: pand %xmm11, %xmm4
3451 ; SSE-NEXT: por %xmm1, %xmm4
3452 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
3453 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
3454 ; SSE-NEXT: pand %xmm14, %xmm1
3455 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3456 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3457 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
3458 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
3459 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
3460 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
3461 ; SSE-NEXT: pandn %xmm4, %xmm14
3462 ; SSE-NEXT: por %xmm1, %xmm14
3463 ; SSE-NEXT: pand %xmm15, %xmm14
3464 ; SSE-NEXT: por %xmm0, %xmm14
3465 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3466 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3467 ; SSE-NEXT: # xmm0 = mem[0,1,0,1]
3468 ; SSE-NEXT: movdqa %xmm2, %xmm1
3469 ; SSE-NEXT: pandn %xmm0, %xmm1
3470 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3471 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
3472 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3473 ; SSE-NEXT: pand %xmm2, %xmm0
3474 ; SSE-NEXT: por %xmm1, %xmm0
3475 ; SSE-NEXT: movdqa %xmm15, %xmm1
3476 ; SSE-NEXT: pandn %xmm0, %xmm1
3477 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3]
3478 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,4,7]
3479 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3480 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
3481 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1]
3482 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3483 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3]
3484 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3]
3485 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3486 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3]
3487 ; SSE-NEXT: andps %xmm15, %xmm8
3488 ; SSE-NEXT: orps %xmm1, %xmm8
3489 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3490 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
3491 ; SSE-NEXT: movdqa %xmm2, %xmm4
3492 ; SSE-NEXT: pandn %xmm1, %xmm4
3493 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3494 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3495 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
3496 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3497 ; SSE-NEXT: pand %xmm2, %xmm1
3498 ; SSE-NEXT: por %xmm4, %xmm1
3499 ; SSE-NEXT: movdqa %xmm15, %xmm4
3500 ; SSE-NEXT: pandn %xmm1, %xmm4
3501 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3502 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
3503 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,4,7]
3504 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3505 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
3506 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1]
3507 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3508 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3509 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3510 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
3511 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
3512 ; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3]
3513 ; SSE-NEXT: andps %xmm15, %xmm14
3514 ; SSE-NEXT: orps %xmm4, %xmm14
3515 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3516 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3517 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
3518 ; SSE-NEXT: movdqa %xmm2, %xmm5
3519 ; SSE-NEXT: pandn %xmm4, %xmm5
3520 ; SSE-NEXT: movdqa %xmm7, %xmm4
3521 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3522 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
3523 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
3524 ; SSE-NEXT: pand %xmm2, %xmm4
3525 ; SSE-NEXT: por %xmm5, %xmm4
3526 ; SSE-NEXT: movdqa %xmm15, %xmm7
3527 ; SSE-NEXT: pandn %xmm4, %xmm7
3528 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3]
3529 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7]
3530 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3531 ; SSE-NEXT: # xmm4 = mem[2,2,3,3]
3532 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
3533 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3534 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3535 ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
3536 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3]
3537 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3538 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
3539 ; SSE-NEXT: andps %xmm15, %xmm3
3540 ; SSE-NEXT: orps %xmm7, %xmm3
3541 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3542 ; SSE-NEXT: movdqa %xmm10, %xmm4
3543 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3544 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
3545 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
3546 ; SSE-NEXT: pand %xmm2, %xmm4
3547 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3548 ; SSE-NEXT: # xmm7 = mem[0,1,0,1]
3549 ; SSE-NEXT: pandn %xmm7, %xmm2
3550 ; SSE-NEXT: por %xmm4, %xmm2
3551 ; SSE-NEXT: movdqa %xmm15, %xmm4
3552 ; SSE-NEXT: pandn %xmm2, %xmm4
3553 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3554 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
3555 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
3556 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3557 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3]
3558 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
3559 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3560 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3561 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
3562 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,1,2,3]
3563 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
3564 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3]
3565 ; SSE-NEXT: andps %xmm15, %xmm0
3566 ; SSE-NEXT: orps %xmm4, %xmm0
3567 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3568 ; SSE-NEXT: movdqa %xmm11, %xmm4
3569 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3570 ; SSE-NEXT: movdqa %xmm9, %xmm7
3571 ; SSE-NEXT: pand %xmm11, %xmm7
3572 ; SSE-NEXT: por %xmm4, %xmm7
3573 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,1,1,0,4,5,6,7]
3574 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
3575 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7]
3576 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3577 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3]
3578 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3579 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3580 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3581 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3582 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7]
3583 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3584 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
3585 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
3586 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
3587 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
3588 ; SSE-NEXT: movdqa %xmm15, %xmm9
3589 ; SSE-NEXT: pandn %xmm0, %xmm9
3590 ; SSE-NEXT: andps %xmm15, %xmm4
3591 ; SSE-NEXT: por %xmm4, %xmm9
3592 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3593 ; SSE-NEXT: movdqa %xmm11, %xmm0
3594 ; SSE-NEXT: pandn %xmm8, %xmm0
3595 ; SSE-NEXT: movdqa %xmm12, %xmm4
3596 ; SSE-NEXT: pand %xmm11, %xmm4
3597 ; SSE-NEXT: por %xmm0, %xmm4
3598 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7]
3599 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3600 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
3601 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
3602 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3603 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3604 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
3605 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3606 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
3607 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3608 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
3609 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3610 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3611 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
3612 ; SSE-NEXT: movdqa %xmm15, %xmm4
3613 ; SSE-NEXT: pandn %xmm1, %xmm4
3614 ; SSE-NEXT: andps %xmm15, %xmm0
3615 ; SSE-NEXT: por %xmm0, %xmm4
3616 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3617 ; SSE-NEXT: movdqa %xmm11, %xmm0
3618 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3619 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3620 ; SSE-NEXT: movdqa %xmm12, %xmm1
3621 ; SSE-NEXT: pand %xmm11, %xmm1
3622 ; SSE-NEXT: por %xmm0, %xmm1
3623 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7]
3624 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3625 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7]
3626 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
3627 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3628 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
3629 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3630 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3631 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3632 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
3633 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3634 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3635 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3636 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3637 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
3638 ; SSE-NEXT: movdqa %xmm15, %xmm4
3639 ; SSE-NEXT: pandn %xmm1, %xmm4
3640 ; SSE-NEXT: andps %xmm15, %xmm0
3641 ; SSE-NEXT: por %xmm0, %xmm4
3642 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3643 ; SSE-NEXT: movdqa %xmm11, %xmm0
3644 ; SSE-NEXT: pandn %xmm3, %xmm0
3645 ; SSE-NEXT: movdqa %xmm14, %xmm1
3646 ; SSE-NEXT: pand %xmm11, %xmm1
3647 ; SSE-NEXT: por %xmm0, %xmm1
3648 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7]
3649 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3650 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7]
3651 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
3652 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
3653 ; SSE-NEXT: movdqa %xmm10, %xmm2
3654 ; SSE-NEXT: movdqa %xmm10, %xmm1
3655 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3656 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3657 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3658 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
3659 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3660 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3661 ; SSE-NEXT: andps %xmm15, %xmm0
3662 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3663 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
3664 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
3665 ; SSE-NEXT: pandn %xmm1, %xmm15
3666 ; SSE-NEXT: por %xmm0, %xmm15
3667 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3668 ; SSE-NEXT: movdqa %xmm9, %xmm0
3669 ; SSE-NEXT: psrld $16, %xmm0
3670 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3671 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3672 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3673 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
3674 ; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
3675 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3676 ; SSE-NEXT: psrlq $48, %xmm1
3677 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3678 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3679 ; SSE-NEXT: movdqa %xmm4, %xmm1
3680 ; SSE-NEXT: psrlq $16, %xmm1
3681 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3682 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
3683 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3684 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3685 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3686 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
3687 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3688 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
3689 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
3690 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3691 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3692 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3693 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3694 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3695 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3696 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3697 ; SSE-NEXT: movdqa %xmm1, %xmm0
3698 ; SSE-NEXT: psrld $16, %xmm0
3699 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3700 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
3701 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3702 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3703 ; SSE-NEXT: movdqa %xmm1, %xmm6
3704 ; SSE-NEXT: movdqa %xmm7, %xmm1
3705 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
3706 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3707 ; SSE-NEXT: psrlq $48, %xmm1
3708 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3709 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3710 ; SSE-NEXT: movdqa %xmm14, %xmm1
3711 ; SSE-NEXT: psrlq $16, %xmm1
3712 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3713 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
3714 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3715 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3716 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3717 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
3718 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3719 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
3720 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
3721 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3722 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3723 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3724 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3725 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3726 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3727 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3728 ; SSE-NEXT: movdqa %xmm1, %xmm7
3729 ; SSE-NEXT: psrld $16, %xmm7
3730 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3731 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
3732 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3733 ; SSE-NEXT: movdqa %xmm1, %xmm6
3734 ; SSE-NEXT: movdqa %xmm8, %xmm1
3735 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
3736 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3737 ; SSE-NEXT: psrlq $48, %xmm1
3738 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3739 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3740 ; SSE-NEXT: movdqa %xmm13, %xmm1
3741 ; SSE-NEXT: psrlq $16, %xmm1
3742 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
3743 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3744 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3745 ; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload
3746 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
3747 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3748 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3749 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
3750 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3751 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3752 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3753 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3754 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3755 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3756 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3757 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3758 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3759 ; SSE-NEXT: movdqa %xmm1, %xmm6
3760 ; SSE-NEXT: psrld $16, %xmm6
3761 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3762 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
3763 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
3764 ; SSE-NEXT: movdqa %xmm1, %xmm12
3765 ; SSE-NEXT: movdqa %xmm9, %xmm1
3766 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
3767 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3768 ; SSE-NEXT: psrlq $48, %xmm1
3769 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3770 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3771 ; SSE-NEXT: psrlq $16, %xmm1
3772 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3773 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3774 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
3775 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3776 ; SSE-NEXT: movdqa %xmm2, %xmm12
3777 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
3778 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3779 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3780 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3781 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3782 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
3783 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
3784 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3785 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3786 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3787 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3788 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
3789 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
3790 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3791 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3792 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
3793 ; SSE-NEXT: movdqa %xmm11, %xmm0
3794 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3795 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3796 ; SSE-NEXT: pand %xmm11, %xmm1
3797 ; SSE-NEXT: por %xmm0, %xmm1
3798 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
3799 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3800 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3801 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3802 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3803 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
3804 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3805 ; SSE-NEXT: movdqa %xmm14, %xmm0
3806 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
3807 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3808 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3809 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3810 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
3811 ; SSE-NEXT: movdqa %xmm11, %xmm0
3812 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3813 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3814 ; SSE-NEXT: pand %xmm11, %xmm1
3815 ; SSE-NEXT: por %xmm0, %xmm1
3816 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
3817 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3818 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3819 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
3820 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3821 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2]
3822 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3823 ; SSE-NEXT: movdqa %xmm13, %xmm0
3824 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3825 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
3826 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3827 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3828 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
3829 ; SSE-NEXT: movdqa %xmm11, %xmm0
3830 ; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload
3831 ; SSE-NEXT: pand %xmm11, %xmm8
3832 ; SSE-NEXT: por %xmm0, %xmm8
3833 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3]
3834 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3835 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3836 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
3837 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3838 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2]
3839 ; SSE-NEXT: pand %xmm11, %xmm2
3840 ; SSE-NEXT: pandn %xmm12, %xmm11
3841 ; SSE-NEXT: por %xmm2, %xmm11
3842 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3843 ; SSE-NEXT: movdqa %xmm12, %xmm0
3844 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3845 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
3846 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3847 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
3848 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
3849 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3]
3850 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
3851 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3852 ; SSE-NEXT: movdqa %xmm3, %xmm11
3853 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3854 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2]
3855 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
3856 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3857 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3858 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3859 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
3860 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3861 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
3862 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3863 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3864 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3865 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
3866 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3867 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
3868 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3869 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
3870 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2]
3871 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
3872 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3873 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
3874 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3875 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
3876 ; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3877 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
3878 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
3879 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3880 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
3881 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
3882 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3883 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3884 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
3885 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3886 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3887 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
3888 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
3889 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
3890 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
3891 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
3892 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
3893 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3894 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
3895 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
3896 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3897 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
3898 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
3899 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3900 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3901 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
3902 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
3903 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
3904 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3]
3905 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
3906 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3907 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3908 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
3909 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
3910 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3911 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
3912 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
3913 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3914 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
3915 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
3916 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3917 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3918 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
3919 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3920 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3921 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
3922 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3923 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
3924 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3925 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
3926 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3927 ; SSE-NEXT: movaps %xmm1, (%rsi)
3928 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3929 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
3930 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3931 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
3932 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3933 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
3934 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3935 ; SSE-NEXT: movaps %xmm1, (%rdx)
3936 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3937 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
3938 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3939 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
3940 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3941 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
3942 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3943 ; SSE-NEXT: movaps %xmm1, (%rcx)
3944 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3945 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
3946 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3947 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
3948 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3949 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
3950 ; SSE-NEXT: movdqa %xmm15, (%r8)
3951 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3952 ; SSE-NEXT: movaps %xmm1, 48(%r8)
3953 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3954 ; SSE-NEXT: movaps %xmm1, 32(%r8)
3955 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3956 ; SSE-NEXT: movaps %xmm1, 16(%r8)
3957 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3958 ; SSE-NEXT: movaps %xmm1, (%r9)
3959 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3960 ; SSE-NEXT: movaps %xmm1, 48(%r9)
3961 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3962 ; SSE-NEXT: movaps %xmm1, 32(%r9)
3963 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3964 ; SSE-NEXT: movaps %xmm1, 16(%r9)
3965 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3966 ; SSE-NEXT: movaps %xmm6, (%rax)
3967 ; SSE-NEXT: movaps %xmm7, 48(%rax)
3968 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3969 ; SSE-NEXT: movaps %xmm1, 32(%rax)
3970 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3971 ; SSE-NEXT: movaps %xmm1, 16(%rax)
3972 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3973 ; SSE-NEXT: movapd %xmm0, (%rax)
3974 ; SSE-NEXT: movapd %xmm3, 48(%rax)
3975 ; SSE-NEXT: movapd %xmm4, 32(%rax)
3976 ; SSE-NEXT: movapd %xmm5, 16(%rax)
3977 ; SSE-NEXT: addq $600, %rsp # imm = 0x258
3978 ; SSE-NEXT: retq
3979 ;
3980 ; AVX1-ONLY-LABEL: load_i16_stride7_vf32:
3981 ; AVX1-ONLY: # %bb.0:
3982 ; AVX1-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8
3983 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15
3984 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm0
3985 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3986 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm8
3987 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[2,2,3,3]
3988 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3989 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3990 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
3991 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3992 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1
3993 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2
3994 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
3995 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3996 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
3997 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1
3998 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3999 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2
4000 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4001 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4002 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4003 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4004 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
4005 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
4006 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4007 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
4008 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4009 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4010 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
4011 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4012 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
4013 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
4014 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3
4015 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,3]
4016 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13
4017 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4018 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
4019 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4020 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3
4021 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4022 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0
4023 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4024 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
4025 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
4026 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
4027 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4028 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6
4029 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7
4030 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero
4031 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4032 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4033 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
4034 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4035 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2
4036 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3
4037 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
4038 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4039 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
4040 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4041 ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1
4042 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4043 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
4044 ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2
4045 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4046 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
4047 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4048 ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm9
4049 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm9, %xmm2
4050 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4051 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3
4052 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4053 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
4054 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4055 ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3
4056 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4057 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2
4058 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4059 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4060 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
4061 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4062 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
4063 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2
4064 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4065 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4066 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4067 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm14
4068 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3]
4069 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4070 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4071 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5
4072 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11
4073 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm11[2],zero
4074 ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4075 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4076 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
4077 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4
4078 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4079 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3
4080 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4081 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
4082 ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4
4083 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4084 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
4085 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
4086 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4
4087 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4088 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
4089 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4090 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
4091 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
4092 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm0
4093 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
4094 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4095 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
4096 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4097 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
4098 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
4099 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4100 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4101 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
4102 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm10[6],xmm15[7]
4103 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
4104 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
4105 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4106 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4107 ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1
4108 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4109 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
4110 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
4111 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4112 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
4113 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4114 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4115 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4116 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4117 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
4118 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
4119 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm2
4120 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
4121 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
4122 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
4123 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
4124 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4
4125 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4126 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
4127 ; AVX1-ONLY-NEXT: # xmm3 = xmm13[0],mem[1],xmm13[2,3,4,5,6,7]
4128 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
4129 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7]
4130 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
4131 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4132 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1
4133 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3
4134 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1
4135 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4136 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4137 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0
4138 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
4139 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
4140 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4141 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4142 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4143 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
4144 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
4145 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4146 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4147 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7]
4148 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
4149 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
4150 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4151 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4152 ; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm1
4153 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4154 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
4155 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
4156 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
4157 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
4158 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4159 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
4160 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
4161 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
4162 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4],xmm4[5,6,7]
4163 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4164 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4
4165 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4166 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4167 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4168 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4169 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
4170 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
4171 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4172 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm5
4173 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
4174 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4175 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
4176 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4
4177 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1
4178 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4179 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4180 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0
4181 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
4182 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
4183 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4184 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4185 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm0
4186 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4187 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4188 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,3,2,3]
4189 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4190 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6,7]
4191 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4192 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4193 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4194 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
4195 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
4196 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4197 ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2]
4198 ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4199 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
4200 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4201 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1]
4202 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm4[7]
4203 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4204 ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
4205 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4206 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4207 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,3]
4208 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4209 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4210 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3]
4211 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm5[1]
4212 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4213 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
4214 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
4215 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
4216 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7]
4217 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4218 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
4219 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5
4220 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
4221 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4222 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4223 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1
4224 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4
4225 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1
4226 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4227 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm1
4228 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4229 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
4230 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3]
4231 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
4232 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4233 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7]
4234 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
4235 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4236 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
4237 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7]
4238 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4239 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
4240 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4241 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4242 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
4243 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1]
4244 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
4245 ; AVX1-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
4246 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,2,3]
4247 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
4248 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7]
4249 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,2,2,2]
4250 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5],xmm15[6,7]
4251 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4252 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1]
4253 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7]
4254 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4255 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
4256 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
4257 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4258 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5
4259 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm3, %ymm14
4260 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5
4261 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4262 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4
4263 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5
4264 ; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm6
4265 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
4266 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4267 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4268 ; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
4269 ; AVX1-ONLY-NEXT: # xmm4 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
4270 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
4271 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
4272 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4273 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
4274 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm3[6],mem[7]
4275 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
4276 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7]
4277 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7]
4278 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4279 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
4280 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7]
4281 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
4282 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7]
4283 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4284 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
4285 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4286 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7]
4287 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7]
4288 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
4289 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3,4,5,6,7]
4290 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4291 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
4292 ; AVX1-ONLY-NEXT: # xmm5 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
4293 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
4294 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
4295 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
4296 ; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
4297 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
4298 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15
4299 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4300 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm13
4301 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13
4302 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
4303 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0
4304 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13
4305 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0
4306 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm13
4307 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13
4308 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0
4309 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0
4310 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4311 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4312 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload
4313 ; AVX1-ONLY-NEXT: # xmm0 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
4314 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4315 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7]
4316 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4317 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload
4318 ; AVX1-ONLY-NEXT: # xmm13 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7]
4319 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7]
4320 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,6,7]
4321 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm0[1,2],xmm13[3,4,5,6,7]
4322 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4323 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4324 ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4325 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,2,3,4,5,6,7]
4326 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
4327 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
4328 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4329 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
4330 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7]
4331 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
4332 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
4333 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7]
4334 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4335 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
4336 ; AVX1-ONLY-NEXT: # xmm14 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
4337 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
4338 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
4339 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
4340 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5
4341 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm14
4342 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5
4343 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1
4344 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm2
4345 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
4346 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2
4347 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4348 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
4349 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
4350 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
4351 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4352 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4353 ; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1]
4354 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
4355 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6,7]
4356 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4357 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3]
4358 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
4359 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4360 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
4361 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7]
4362 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1]
4363 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4364 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
4365 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4366 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1
4367 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4368 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
4369 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4370 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm4
4371 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4372 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
4373 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
4374 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4375 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,0,3]
4376 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4377 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4378 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4379 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
4380 ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4381 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7]
4382 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
4383 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3]
4384 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5
4385 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
4386 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4387 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4388 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
4389 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4
4390 ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm13
4391 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
4392 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4393 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
4394 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4395 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
4396 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4397 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
4398 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
4399 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
4400 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7]
4401 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4402 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4403 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
4404 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4405 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2
4406 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4407 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
4408 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
4409 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4410 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm4
4411 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4412 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4413 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7]
4414 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4415 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
4416 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
4417 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4418 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4419 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4420 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
4421 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1
4422 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4423 ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3]
4424 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
4425 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
4426 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4427 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0
4428 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1
4429 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
4430 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4431 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm0
4432 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4433 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4434 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4435 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
4436 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
4437 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1
4438 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
4439 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4440 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4441 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm11[6],xmm13[7]
4442 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4443 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
4444 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4445 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1
4446 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4447 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4448 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4449 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
4450 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
4451 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
4452 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
4453 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload
4454 ; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm14[1],mem[2,3,4,5,6,7]
4455 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4456 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,7]
4457 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4458 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
4459 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
4460 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
4461 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4462 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4463 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
4464 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
4465 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4466 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4467 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0
4468 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1
4469 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
4470 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4471 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4472 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
4473 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1
4474 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2
4475 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4476 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4477 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4478 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
4479 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4480 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4481 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5],xmm0[6],xmm7[7]
4482 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4483 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
4484 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
4485 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm2
4486 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
4487 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4488 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
4489 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10
4490 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
4491 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
4492 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7]
4493 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4494 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4495 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm15[1],xmm9[2,3,4,5,6,7]
4496 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
4497 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7]
4498 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4499 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
4500 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
4501 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
4502 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4503 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4504 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
4505 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
4506 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4507 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4508 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1
4509 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2
4510 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
4511 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4512 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
4513 ; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1],mem[0],zero
4514 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4515 ; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
4516 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4517 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
4518 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm13[1],xmm11[2,3,4,5,6,7]
4519 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
4520 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
4521 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
4522 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4523 ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
4524 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4525 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4526 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4527 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4528 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4529 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
4530 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
4531 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
4532 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
4533 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4534 ; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
4535 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4536 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
4537 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,0,3]
4538 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
4539 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7]
4540 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4541 ; AVX1-ONLY-NEXT: # xmm12 = mem[3,3,3,3]
4542 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5
4543 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
4544 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4545 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4546 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
4547 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4
4548 ; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11
4549 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
4550 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4551 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4552 ; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero
4553 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4554 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4555 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4556 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
4557 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7]
4558 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
4559 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
4560 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
4561 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1]
4562 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4563 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4564 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4565 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
4566 ; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4567 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
4568 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
4569 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
4570 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
4571 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
4572 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,3]
4573 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
4574 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
4575 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3]
4576 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
4577 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
4578 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4579 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4
4580 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm3
4581 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
4582 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4583 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi)
4584 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
4585 ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi)
4586 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4587 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx)
4588 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4589 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx)
4590 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4591 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx)
4592 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4593 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx)
4594 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4595 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
4596 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4597 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8)
4598 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4599 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9)
4600 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4601 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9)
4602 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
4603 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax)
4604 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4605 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
4606 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
4607 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax)
4608 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax)
4609 ; AVX1-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8
4610 ; AVX1-ONLY-NEXT: vzeroupper
4611 ; AVX1-ONLY-NEXT: retq
4612 ;
4613 ; AVX2-SLOW-LABEL: load_i16_stride7_vf32:
4614 ; AVX2-SLOW: # %bb.0:
4615 ; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208
4616 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10
4617 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9
4618 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5
4619 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13
4620 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
4621 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
4622 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11
4623 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
4624 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
4625 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8
4626 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
4627 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
4628 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
4629 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
4630 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6
4631 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7
4632 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
4633 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
4634 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
4635 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm3
4636 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
4637 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1
4638 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4639 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3
4640 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7]
4641 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5
4642 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
4643 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
4644 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
4645 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
4646 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0
4647 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
4648 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4649 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
4650 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
4651 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
4652 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
4653 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4654 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7]
4655 ; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm1
4656 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
4657 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
4658 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
4659 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
4660 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0
4661 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4662 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7]
4663 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
4664 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
4665 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4666 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
4667 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
4668 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
4669 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
4670 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
4671 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4672 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1]
4673 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
4674 ; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4675 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4676 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
4677 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7]
4678 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4679 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
4680 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
4681 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
4682 ; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0
4683 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
4684 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
4685 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13
4686 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7]
4687 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4
4688 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
4689 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm1
4690 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4691 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4692 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
4693 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
4694 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm3
4695 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4696 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
4697 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
4698 ; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14
4699 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4700 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
4701 ; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2
4702 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10
4703 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7]
4704 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15
4705 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4706 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4707 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
4708 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7]
4709 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
4710 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
4711 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
4712 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
4713 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4714 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9
4715 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
4716 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4717 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
4718 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
4719 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
4720 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
4721 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
4722 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4723 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8
4724 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7
4725 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2
4726 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7]
4727 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11
4728 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
4729 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
4730 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4731 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
4732 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4733 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
4734 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5
4735 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,2]
4736 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5]
4737 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4738 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
4739 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
4740 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7]
4741 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4742 ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6
4743 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4
4744 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
4745 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7]
4746 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
4747 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4748 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
4749 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4750 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
4751 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3
4752 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,0,2]
4753 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5]
4754 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4755 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
4756 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4757 ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
4758 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4759 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7]
4760 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14
4761 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7]
4762 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
4763 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1
4764 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4765 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
4766 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
4767 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
4768 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4769 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4770 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4771 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4772 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7]
4773 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4774 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
4775 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0
4776 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4777 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
4778 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
4779 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
4780 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4781 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
4782 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4783 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4784 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
4785 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4786 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
4787 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
4788 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
4789 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4790 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2]
4791 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
4792 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
4793 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
4794 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
4795 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
4796 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4797 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
4798 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4799 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
4800 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
4801 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4802 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,1,2]
4803 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
4804 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
4805 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
4806 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
4807 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
4808 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4809 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
4810 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4811 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
4812 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
4813 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
4814 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4815 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3]
4816 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4]
4817 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
4818 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
4819 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
4820 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
4821 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4822 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
4823 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
4824 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
4825 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
4826 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
4827 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
4828 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3]
4829 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
4830 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
4831 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
4832 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
4833 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
4834 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4835 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7]
4836 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
4837 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4838 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
4839 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
4840 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
4841 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3
4842 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4843 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5]
4844 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4845 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
4846 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
4847 ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7]
4848 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5
4849 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
4850 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
4851 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
4852 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
4853 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
4854 ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
4855 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7]
4856 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27>
4857 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
4858 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15]
4859 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
4860 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15]
4861 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
4862 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
4863 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4864 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
4865 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4866 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
4867 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
4868 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
4869 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
4870 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
4871 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4872 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4873 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4874 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
4875 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm5
4876 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
4877 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
4878 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
4879 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
4880 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7
4881 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
4882 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
4883 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4884 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
4885 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
4886 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
4887 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
4888 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4889 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4890 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
4891 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4892 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
4893 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
4894 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
4895 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
4896 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
4897 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
4898 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
4899 ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3
4900 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0
4901 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
4902 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
4903 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
4904 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
4905 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
4906 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7]
4907 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
4908 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
4909 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
4910 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
4911 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
4912 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
4913 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29>
4914 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1
4915 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
4916 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
4917 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4918 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4919 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
4920 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15
4921 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7
4922 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7]
4923 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
4924 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
4925 ; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11
4926 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1
4927 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
4928 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
4929 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6]
4930 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
4931 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15]
4932 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4933 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7]
4934 ; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13
4935 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1]
4936 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15]
4937 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm8
4938 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4939 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4940 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
4941 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14
4942 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
4943 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7]
4944 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
4945 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4946 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
4947 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15]
4948 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
4949 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
4950 ; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
4951 ; AVX2-SLOW-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7]
4952 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
4953 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15]
4954 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
4955 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6
4956 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
4957 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
4958 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
4959 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
4960 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
4961 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4962 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4963 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4964 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
4965 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
4966 ; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4967 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
4968 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
4969 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
4970 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
4971 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
4972 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
4973 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31>
4974 ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8
4975 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
4976 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
4977 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4978 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
4979 ; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
4980 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8
4981 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7]
4982 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
4983 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7]
4984 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4985 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4986 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4987 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4988 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15]
4989 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7]
4990 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
4991 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
4992 ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3
4993 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7]
4994 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
4995 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
4996 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
4997 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
4998 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
4999 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
5000 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5001 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
5002 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5003 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi)
5004 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5005 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi)
5006 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5007 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx)
5008 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5009 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx)
5010 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5011 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx)
5012 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5013 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx)
5014 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5015 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8)
5016 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5017 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8)
5018 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5019 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9)
5020 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5021 ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9)
5022 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5023 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5024 ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax)
5025 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax)
5026 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
5027 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax)
5028 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax)
5029 ; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208
5030 ; AVX2-SLOW-NEXT: vzeroupper
5031 ; AVX2-SLOW-NEXT: retq
5032 ;
5033 ; AVX2-FAST-LABEL: load_i16_stride7_vf32:
5034 ; AVX2-FAST: # %bb.0:
5035 ; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288
5036 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm7
5037 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8
5038 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5
5039 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6
5040 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3
5041 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
5042 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
5043 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2
5044 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
5045 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13
5046 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm14
5047 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5048 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
5049 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2
5050 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
5051 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11
5052 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12
5053 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
5054 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
5055 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
5056 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4
5057 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
5058 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15
5059 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
5060 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9
5061 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm10
5062 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
5063 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
5064 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
5065 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6
5066 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
5067 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
5068 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
5069 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
5070 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5071 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
5072 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,1,u,4,u,u,u>
5073 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
5074 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
5075 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
5076 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
5077 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
5078 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
5079 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
5080 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
5081 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1
5082 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5083 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7
5084 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
5085 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
5086 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
5087 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
5088 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
5089 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2
5090 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
5091 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
5092 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5093 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
5094 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5095 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5096 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
5097 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5098 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5099 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
5100 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
5101 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,1,u,5,u,u,u>
5102 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1
5103 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
5104 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1
5105 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
5106 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
5107 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm9
5108 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
5109 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5110 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5111 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
5112 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
5113 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
5114 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5115 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5
5116 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5117 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
5118 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
5119 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5120 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6
5121 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
5122 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
5123 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
5124 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
5125 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
5126 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
5127 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
5128 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
5129 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
5130 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
5131 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5132 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
5133 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
5134 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
5135 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1
5136 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
5137 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
5138 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
5139 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8
5140 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0
5141 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1
5142 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
5143 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5
5144 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7
5145 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5]
5146 ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1]
5147 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm2
5148 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31]
5149 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12
5150 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4
5151 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2]
5152 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
5153 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11
5154 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
5155 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
5156 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7]
5157 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5158 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2
5159 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm3
5160 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
5161 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12
5162 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11
5163 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm13, %ymm13
5164 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1
5165 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2
5166 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2]
5167 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3
5168 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm15
5169 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
5170 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5171 ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
5172 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5173 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10
5174 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
5175 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15
5176 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7]
5177 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
5178 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1
5179 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5180 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
5181 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5182 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5183 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5184 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
5185 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5186 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5187 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7]
5188 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5189 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
5190 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0
5191 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1
5192 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5193 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5194 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5195 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
5196 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5197 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5198 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
5199 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7
5200 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5201 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
5202 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
5203 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5204 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5205 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
5206 ; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
5207 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm14
5208 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
5209 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
5210 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
5211 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
5212 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5213 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
5214 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14
5215 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9
5216 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7]
5217 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5218 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5219 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5220 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm1
5221 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
5222 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5223 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15]
5224 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
5225 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5226 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7]
5227 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5228 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
5229 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
5230 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5231 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5232 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,3]
5233 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
5234 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm9
5235 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
5236 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5237 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
5238 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5239 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5240 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7]
5241 ; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12
5242 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4
5243 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
5244 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
5245 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5246 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,3]
5247 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm1
5248 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5249 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
5250 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
5251 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5252 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5253 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5254 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
5255 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
5256 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
5257 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
5258 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5259 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7]
5260 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5261 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5262 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8
5263 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7]
5264 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
5265 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5266 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
5267 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
5268 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5269 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5270 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7]
5271 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u>
5272 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6
5273 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
5274 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6
5275 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
5276 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15]
5277 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5278 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5279 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5280 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5281 ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
5282 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
5283 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
5284 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
5285 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5286 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5287 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5288 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7]
5289 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2
5290 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
5291 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9
5292 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7]
5293 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
5294 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
5295 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5296 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
5297 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2
5298 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
5299 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
5300 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5301 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5302 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7]
5303 ; AVX2-FAST-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
5304 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
5305 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7]
5306 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4]
5307 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
5308 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
5309 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
5310 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15]
5311 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
5312 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15
5313 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
5314 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15
5315 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5316 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
5317 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
5318 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9
5319 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7
5320 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u>
5321 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15
5322 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29>
5323 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15
5324 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15]
5325 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3]
5326 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
5327 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5328 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5329 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7]
5330 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1
5331 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
5332 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm11
5333 ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12
5334 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7]
5335 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8
5336 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
5337 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15]
5338 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
5339 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2
5340 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
5341 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5342 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5343 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
5344 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8
5345 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
5346 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5347 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
5348 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
5349 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
5350 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5351 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5352 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5353 ; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5354 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
5355 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0]
5356 ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
5357 ; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload
5358 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
5359 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3
5360 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5]
5361 ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
5362 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0
5363 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
5364 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
5365 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
5366 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
5367 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5368 ; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
5369 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
5370 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
5371 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
5372 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9
5373 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
5374 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
5375 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,4,0,3,7,u,u,u>
5376 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3
5377 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
5378 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3
5379 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
5380 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
5381 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5382 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2
5383 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2
5384 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
5385 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3
5386 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3
5387 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
5388 ; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
5389 ; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7]
5390 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3
5391 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3
5392 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7]
5393 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
5394 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5
5395 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
5396 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
5397 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
5398 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5399 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5400 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5401 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi)
5402 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5403 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi)
5404 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5405 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx)
5406 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5407 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx)
5408 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5409 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx)
5410 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5411 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx)
5412 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5413 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8)
5414 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5415 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8)
5416 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5417 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9)
5418 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5419 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9)
5420 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
5421 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5422 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax)
5423 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5424 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax)
5425 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
5426 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax)
5427 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax)
5428 ; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288
5429 ; AVX2-FAST-NEXT: vzeroupper
5430 ; AVX2-FAST-NEXT: retq
5431 ;
5432 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf32:
5433 ; AVX2-FAST-PERLANE: # %bb.0:
5434 ; AVX2-FAST-PERLANE-NEXT: subq $552, %rsp # imm = 0x228
5435 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm11
5436 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10
5437 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5
5438 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12
5439 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3
5440 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4
5441 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6
5442 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1
5443 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
5444 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9
5445 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
5446 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
5447 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2
5448 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
5449 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7
5450 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8
5451 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3
5452 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
5453 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
5454 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3
5455 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
5456 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1
5457 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
5458 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm1
5459 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7]
5460 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5
5461 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
5462 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2
5463 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
5464 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
5465 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0
5466 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
5467 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5468 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
5469 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
5470 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
5471 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
5472 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5473 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7]
5474 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
5475 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
5476 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
5477 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4
5478 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0
5479 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5480 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
5481 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
5482 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
5483 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5484 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
5485 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
5486 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
5487 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2
5488 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
5489 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5490 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1]
5491 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
5492 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5493 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5494 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
5495 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7]
5496 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5497 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3
5498 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
5499 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
5500 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
5501 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0
5502 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
5503 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2
5504 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
5505 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5506 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7]
5507 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5508 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5509 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5510 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
5511 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
5512 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13
5513 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5514 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
5515 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1]
5516 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5517 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
5518 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5519 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7
5520 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
5521 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15
5522 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5523 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5524 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
5525 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
5526 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
5527 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
5528 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5529 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
5530 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
5531 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10
5532 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7]
5533 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5534 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
5535 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
5536 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
5537 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
5538 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5539 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8
5540 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6
5541 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2
5542 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7]
5543 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11
5544 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
5545 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7]
5546 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
5547 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
5548 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5549 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
5550 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm5
5551 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2]
5552 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
5553 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4
5554 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
5555 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
5556 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
5557 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
5558 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm9
5559 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm4
5560 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7]
5561 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm14
5562 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3
5563 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
5564 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
5565 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
5566 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm1
5567 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2]
5568 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm13
5569 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7]
5570 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
5571 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7]
5572 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5573 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7]
5574 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13
5575 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7]
5576 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
5577 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3
5578 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5579 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
5580 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm14
5581 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
5582 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5583 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15]
5584 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7]
5585 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5586 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
5587 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14
5588 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7]
5589 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3
5590 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
5591 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2
5592 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5593 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5594 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
5595 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5596 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5597 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7]
5598 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5599 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
5600 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
5601 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5602 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5603 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2]
5604 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
5605 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3
5606 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
5607 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5608 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
5609 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5610 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5611 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7]
5612 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3
5613 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
5614 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5615 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2]
5616 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2
5617 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5618 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
5619 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
5620 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
5621 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5622 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7]
5623 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
5624 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
5625 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
5626 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5627 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5628 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,1,3]
5629 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
5630 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm5
5631 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
5632 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
5633 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
5634 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5635 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7]
5636 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5
5637 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
5638 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5639 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
5640 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,1,3]
5641 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm1
5642 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5643 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
5644 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
5645 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5646 ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
5647 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
5648 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
5649 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3
5650 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14
5651 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
5652 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5653 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5654 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5655 ; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
5656 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7]
5657 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27>
5658 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm8
5659 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
5660 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15]
5661 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
5662 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7]
5663 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5664 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5665 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm10
5666 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
5667 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
5668 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3
5669 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5670 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
5671 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7
5672 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
5673 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15]
5674 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5675 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5676 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5677 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5678 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
5679 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm3
5680 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
5681 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5682 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5683 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5684 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5685 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7]
5686 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm5
5687 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
5688 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
5689 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
5690 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
5691 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm7
5692 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
5693 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3
5694 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0
5695 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5696 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
5697 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
5698 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5699 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5700 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7]
5701 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
5702 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15]
5703 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
5704 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3
5705 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7]
5706 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
5707 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0
5708 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
5709 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm3
5710 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0
5711 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
5712 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
5713 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10
5714 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5715 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
5716 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
5717 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14
5718 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
5719 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14
5720 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5721 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
5722 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29>
5723 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2
5724 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
5725 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3]
5726 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
5727 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5728 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7]
5729 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10
5730 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7]
5731 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm10
5732 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15
5733 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm12
5734 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7]
5735 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5
5736 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5737 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
5738 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15]
5739 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5740 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5741 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7]
5742 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
5743 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15]
5744 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm10, %ymm10
5745 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5746 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5747 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
5748 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14
5749 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14
5750 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
5751 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
5752 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15]
5753 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
5754 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5755 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5756 ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
5757 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7]
5758 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1]
5759 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15]
5760 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
5761 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6
5762 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
5763 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
5764 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
5765 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
5766 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5767 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15]
5768 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
5769 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5770 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
5771 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5772 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5773 ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
5774 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm10
5775 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
5776 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10
5777 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
5778 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
5779 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31>
5780 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5
5781 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
5782 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
5783 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5784 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5785 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5786 ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
5787 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
5788 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
5789 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3
5790 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
5791 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4
5792 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5793 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5794 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15]
5795 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7]
5796 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
5797 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
5798 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4
5799 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7]
5800 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6
5801 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6
5802 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
5803 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
5804 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
5805 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5806 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5807 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5808 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi)
5809 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
5810 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi)
5811 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5812 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx)
5813 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5814 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx)
5815 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5816 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx)
5817 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5818 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx)
5819 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5820 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8)
5821 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5822 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8)
5823 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5824 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r9)
5825 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5826 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r9)
5827 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5828 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5829 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rax)
5830 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax)
5831 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5832 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax)
5833 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax)
5834 ; AVX2-FAST-PERLANE-NEXT: addq $552, %rsp # imm = 0x228
5835 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
5836 ; AVX2-FAST-PERLANE-NEXT: retq
;
5838 ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf32:
5839 ; AVX512F-ONLY-SLOW: # %bb.0:
5840 ; AVX512F-ONLY-SLOW-NEXT: pushq %rax
5841 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2
5842 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3
5843 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12
5844 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4
5845 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7]
5846 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5847 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5848 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
5849 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13
5850 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5851 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
5852 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
5853 ; AVX512F-ONLY-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21
5854 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
5855 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10
5856 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
5857 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
5858 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
5859 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
5860 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
5861 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
5862 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
5863 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
5864 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
5865 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,1,4,5,6,5]
5866 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
5867 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
5868 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm5
5869 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm7
5870 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,3]
5871 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm14
5872 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7]
5873 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
5874 ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
5875 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
5876 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm21 {%k1}
5877 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5
5878 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6
5879 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
5880 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27
5881 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
5882 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3],xmm7[4],xmm2[5],xmm7[6,7]
5883 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15
5884 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
5885 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7]
5886 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6,7,8,9,10],ymm2[11],ymm8[12,13,14,15]
5887 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5888 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7]
5889 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11
5890 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7]
5891 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5892 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm8, %ymm2
5893 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5894 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
5895 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8
5896 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4],xmm2[5],xmm8[6],xmm2[7]
5897 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
5898 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5899 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
5900 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
5901 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5902 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7]
5903 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
5904 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
5905 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22
5906 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7]
5907 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5908 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
5909 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5910 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7]
5911 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8
5912 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7]
5913 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5914 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1
5915 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5916 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
5917 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5918 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
5919 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
5920 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5921 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2]
5922 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
5923 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
5924 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
5925 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
5926 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm17
5927 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,1,2,3]
5928 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29
5929 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
5930 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24
5931 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7]
5932 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
5933 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5934 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7]
5935 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm25
5936 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26
5937 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8
5938 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7]
5939 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5940 ; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm20
5941 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3]
5942 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
5943 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
5944 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
5945 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
5946 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5947 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5]
5948 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
5949 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5950 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1
5951 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2
5952 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
5953 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm19
5954 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
5955 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
5956 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
5957 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
5958 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5959 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13
5960 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1
5961 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm13[6],xmm1[7]
5962 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
5963 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
5964 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5965 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
5966 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm14, %xmm2
5967 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
5968 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm16
5969 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18
5970 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15
5971 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3
5972 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7]
5973 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1]
5974 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6],ymm2[7,8,9,10,11,12,13],ymm8[14],ymm2[15]
5975 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,1]
5976 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
5977 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
5978 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2],xmm7[3,4,5,6],xmm2[7]
5979 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
5980 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7
5981 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8
5982 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
5983 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
5984 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7]
5985 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5986 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
5987 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
5988 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5989 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5990 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30
5991 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5
5992 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
5993 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
5994 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
5995 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7]
5996 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1]
5997 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2,3,4,5,6],ymm11[7,8],ymm2[9,10,11,12,13,14],ymm11[15]
5998 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
5999 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
6000 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
6001 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6002 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
6003 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11
6004 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
6005 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
6006 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
6007 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
6008 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3]
6009 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6010 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6011 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28
6012 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7]
6013 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6014 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
6015 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
6016 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
6017 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11
6018 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm11[4],xmm2[5],xmm11[6],xmm2[7]
6019 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u]
6020 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6021 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
6022 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
6023 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
6024 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
6025 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7]
6026 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
6027 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6028 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
6029 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31
6030 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7]
6031 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6032 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
6033 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
6034 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
6035 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6036 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4]
6037 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
6038 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
6039 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2
6040 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
6041 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
6042 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm27
6043 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
6044 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6045 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
6046 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7]
6047 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14
6048 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7]
6049 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
6050 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
6051 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,2,0]
6052 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6053 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0
6054 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
6055 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
6056 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6057 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7],ymm14[8,9,10],ymm2[11,12,13,14,15]
6058 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6059 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29
6060 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
6061 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9
6062 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7]
6063 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3,4,5,6,7]
6064 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
6065 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6066 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
6067 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
6068 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6069 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6070 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1
6071 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2
6072 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
6073 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
6074 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
6075 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23
6076 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm4[2],ymm12[3,4,5],ymm4[6],ymm12[7]
6077 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6078 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
6079 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
6080 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2
6081 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11
6082 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7]
6083 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10
6084 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
6085 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
6086 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
6087 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
6088 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
6089 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
6090 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10
6091 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7]
6092 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7]
6093 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,1,0,1]
6094 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7,8,9,10],ymm13[11],ymm10[12,13,14,15]
6095 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
6096 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14
6097 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
6098 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
6099 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6100 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6101 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
6102 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
6103 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm13, %ymm10
6104 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7]
6105 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
6106 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10
6107 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7]
6108 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7]
6109 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1]
6110 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4],ymm10[5,6,7,8,9,10,11],ymm14[12],ymm10[13,14,15]
6111 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
6112 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9
6113 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2],xmm14[3],xmm9[4,5,6,7]
6114 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
6115 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6116 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6117 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
6118 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
6119 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm9, %ymm9
6120 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6121 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7]
6122 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
6123 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6],ymm10[7,8],ymm9[9,10,11,12,13,14],ymm10[15]
6124 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7]
6125 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm14
6126 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
6127 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7]
6128 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
6129 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
6130 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
6131 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
6132 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,3,4,5,6,7]
6133 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6,7]
6134 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1]
6135 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6,7,8],ymm9[9],ymm4[10,11,12,13,14,15]
6136 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm9
6137 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm12
6138 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm14
6139 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm25
6140 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13
6141 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26
6142 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7]
6143 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,4,6,7]
6144 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
6145 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
6146 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
6147 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
6148 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
6149 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7]
6150 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
6151 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4
6152 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
6153 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
6154 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
6155 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
6156 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
6157 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
6158 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
6159 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7]
6160 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,3,1]
6161 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
6162 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7]
6163 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
6164 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6165 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
6166 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6167 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
6168 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
6169 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm3, %zmm9
6170 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
6171 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm22
6172 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm12
6173 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6174 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm24 # 64-byte Folded Reload
6175 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
6176 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
6177 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k1}
6178 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm3, %zmm27
6179 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1}
6180 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm1
6181 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm1 {%k1}
6182 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rsi)
6183 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx)
6184 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx)
6185 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%r8)
6186 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9)
6187 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6188 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm3, %zmm10
6189 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm10 {%k1}
6190 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax)
6191 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6192 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm3, %zmm2
6193 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
6194 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax)
6195 ; AVX512F-ONLY-SLOW-NEXT: popq %rax
6196 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper
6197 ; AVX512F-ONLY-SLOW-NEXT: retq
;
6199 ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf32:
6200 ; AVX512F-ONLY-FAST: # %bb.0:
6201 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28
6202 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1
6203 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13]
6204 ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1]
6205 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm23
6206 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
6207 ; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3]
6208 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15]
6209 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13]
6210 ; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
6211 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm18
6212 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,u,u,u,5,8,12,15>
6213 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u>
6214 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm14
6215 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm2, %zmm13
6216 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,u,u,u,4,8,11,15>
6217 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm2
6218 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u>
6219 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm10
6220 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm0
6221 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14>
6222 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm6
6223 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
6224 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm3, %zmm3
6225 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm4
6226 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25
6227 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,0,2]
6228 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
6229 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5
6230 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm30
6231 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
6232 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7]
6233 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6234 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3
6235 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
6236 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
6237 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
6238 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
6239 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
6240 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm11, %ymm22
6241 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7
6242 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm12
6243 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
6244 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
6245 ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0
6246 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
6247 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm22 {%k1}
6248 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm5
6249 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm7
6250 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
6251 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm15
6252 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4],xmm11[5],xmm15[6,7]
6253 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
6254 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
6255 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7]
6256 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm6[4,5,6,7]
6257 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
6258 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0
6259 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
6260 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29
6261 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6262 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
6263 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
6264 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
6265 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6266 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0
6267 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6268 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
6269 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm10
6270 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7]
6271 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15
6272 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7]
6273 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15
6274 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
6275 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6276 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
6277 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7]
6278 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
6279 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u]
6280 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24
6281 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
6282 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8
6283 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
6284 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
6285 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
6286 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
6287 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6288 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
6289 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6290 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20
6291 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6292 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
6293 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8
6294 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7]
6295 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6296 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm31
6297 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
6298 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
6299 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
6300 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
6301 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6302 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
6303 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm2, %ymm2
6304 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6305 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
6306 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
6307 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
6308 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm14
6309 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7]
6310 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8
6311 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7]
6312 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm26, %zmm8
6313 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u]
6314 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
6315 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0
6316 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm27, %zmm8
6317 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6
6318 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
6319 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27
6320 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
6321 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6
6322 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7]
6323 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm19, %zmm6
6324 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6325 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6326 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm0, %ymm21
6327 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
6328 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6
6329 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7]
6330 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
6331 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
6332 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3]
6333 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
6334 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
6335 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
6336 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm19
6337 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm2
6338 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm8
6339 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm2[2],ymm8[3,4,5],ymm2[6],ymm8[7]
6340 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9
6341 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7]
6342 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7]
6343 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13
6344 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3,4,5],xmm9[6],xmm13[7]
6345 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,11,2,11,12,5,8,9]
6346 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
6347 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6348 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm13, %zmm13
6349 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6350 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15]
6351 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
6352 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9
6353 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
6354 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17
6355 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
6356 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
6357 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7]
6358 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
6359 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6360 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm6
6361 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0
6362 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
6363 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm9
6364 ; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm10
6365 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
6366 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm16, %zmm10
6367 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,7,10,14,u,u,u>
6368 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9
6369 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
6370 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,3,u,0,3,7,u>
6371 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm13, %ymm13
6372 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
6373 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
6374 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7],ymm10[8,9,10,11,12],ymm13[13,14,15]
6375 ; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm12, %xmm13
6376 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
6377 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm10, %zmm26
6378 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
6379 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10
6380 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
6381 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
6382 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
6383 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm10
6384 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
6385 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10
6386 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18
6387 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7]
6388 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6389 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm9
6390 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7]
6391 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11
6392 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
6393 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7]
6394 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6
6395 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7]
6396 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
6397 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6398 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,u,u,u,6,9,13,u>
6399 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm11, %zmm11
6400 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6401 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
6402 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
6403 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6
6404 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7]
6405 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
6406 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0
6407 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10
6408 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm16
6409 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm13
6410 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27
6411 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm29
6412 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13]
6413 ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1]
6414 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm0
6415 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,4,7,11,14,u,u,u>
6416 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm17
6417 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm23, %zmm6
6418 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
6419 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6420 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
6421 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm6[2,3,4,5,6,7]
6422 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6,7]
6423 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10
6424 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3,4,5],xmm10[6],xmm6[7]
6425 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0]
6426 ; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1]
6427 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm10, %ymm10
6428 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
6429 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
6430 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15]
6431 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
6432 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
6433 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <3,u,u,u,6,10,13,u>
6434 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12
6435 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7]
6436 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10
6437 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4,5,6,7]
6438 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10
6439 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
6440 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6441 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6442 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7],ymm10[8,9,10],ymm6[11,12,13,14,15]
6443 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
6444 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0
6445 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
6446 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u>
6447 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm10
6448 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm6, %zmm0
6449 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1
6450 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
6451 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
6452 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
6453 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
6454 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
6455 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
6456 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
6457 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7]
6458 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
6459 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
6460 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3]
6461 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
6462 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
6463 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
6464 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
6465 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6466 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm2
6467 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
6468 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7]
6469 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
6470 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6471 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
6472 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6473 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
6474 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
6475 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
6476 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm16
6477 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6478 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24
6479 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm13
6480 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm30, %zmm14
6481 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %di # imm = 0xFE00
6482 ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k1
6483 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm27, %zmm14 {%k1}
6484 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm19
6485 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1}
6486 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm9 {%k1}
6487 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm26, %zmm30, %zmm11
6488 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1}
6489 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi)
6490 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx)
6491 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx)
6492 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%r8)
6493 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9)
6494 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax)
6495 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
6496 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm12, %zmm30, %zmm0
6497 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
6498 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
6499 ; AVX512F-ONLY-FAST-NEXT: vzeroupper
6500 ; AVX512F-ONLY-FAST-NEXT: retq
6501 ;
6502 ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf32:
6503 ; AVX512DQ-SLOW: # %bb.0:
6504 ; AVX512DQ-SLOW-NEXT: pushq %rax
6505 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2
6506 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3
6507 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1
6508 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4
6509 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7]
6510 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm7
6511 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm12
6512 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
6513 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6514 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
6515 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10
6516 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11
6517 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
6518 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
6519 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
6520 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm6
6521 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8
6522 ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9
6523 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
6524 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
6525 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
6526 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
6527 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
6528 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
6529 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
6530 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
6531 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
6532 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5]
6533 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
6534 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6535 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm3
6536 ; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5
6537 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,3]
6538 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
6539 ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
6540 ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
6541 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
6542 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm6 {%k1}
6543 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6544 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3
6545 ; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4
6546 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
6547 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6
6548 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25
6549 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6550 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1,2],xmm2[3],xmm3[4],xmm2[5],xmm3[6,7]
6551 ; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15
6552 ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2
6553 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7]
6554 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
6555 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6556 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
6557 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
6558 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
6559 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6560 ; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2
6561 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6562 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7]
6563 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6564 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
6565 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
6566 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6567 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
6568 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
6569 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6570 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm5[1],xmm15[2,3,4,5,6,7]
6571 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6572 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
6573 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20
6574 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7]
6575 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
6576 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
6577 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6578 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7]
6579 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6580 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
6581 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6582 ; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18
6583 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
6584 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
6585 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
6586 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
6587 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6588 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2]
6589 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
6590 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6591 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6592 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
6593 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
6594 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm13
6595 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
6596 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm24
6597 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7]
6598 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
6599 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6600 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
6601 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26
6602 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6603 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
6604 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6605 ; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm19
6606 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,1,1,3]
6607 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
6608 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6609 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
6610 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
6611 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6612 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,1,2,1,4,5,6,5]
6613 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
6614 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
6615 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm1
6616 ; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm15, %xmm2
6617 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6618 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21
6619 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
6620 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6621 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
6622 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
6623 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6624 ; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2
6625 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3
6626 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7]
6627 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17
6628 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29
6629 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
6630 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
6631 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6632 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6633 ; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm5, %xmm1
6634 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm27
6635 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
6636 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28
6637 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23
6638 ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm11
6639 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15
6640 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
6641 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6642 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
6643 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,1]
6644 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
6645 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
6646 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7]
6647 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6648 ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm14
6649 ; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5
6650 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7]
6651 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6652 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
6653 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
6654 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
6655 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6656 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6657 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6658 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30
6659 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4
6660 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
6661 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6662 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
6663 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7]
6664 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6665 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15]
6666 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
6667 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
6668 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
6669 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6670 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7]
6671 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6672 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
6673 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
6674 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
6675 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
6676 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6677 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6678 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6679 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16
6680 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7]
6681 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6682 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
6683 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
6684 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
6685 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6686 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
6687 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u]
6688 ; AVX512DQ-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6689 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7]
6690 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
6691 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
6692 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
6693 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
6694 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
6695 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6696 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
6697 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6698 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm24
6699 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
6700 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
6701 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1}
6702 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
6703 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6704 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
6705 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
6706 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
6707 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6708 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,1,2,0,4,5,6,4]
6709 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
6710 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
6711 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,6,7]
6712 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
6713 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm18
6714 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
6715 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31
6716 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6717 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
6718 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7]
6719 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6720 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
6721 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
6722 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7]
6723 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0]
6724 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6725 ; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0
6726 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
6727 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
6728 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6729 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
6730 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6731 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm18
6732 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm18 {%k1}
6733 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm1
6734 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm6
6735 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7]
6736 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6737 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
6738 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
6739 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm13
6740 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
6741 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22
6742 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6743 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
6744 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
6745 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
6746 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6747 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
6748 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7]
6749 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6750 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
6751 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7]
6752 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[0,1,0,1]
6753 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7,8,9,10],ymm12[11],ymm3[12,13,14,15]
6754 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm7
6755 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7]
6756 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10
6757 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7]
6758 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
6759 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6760 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6761 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
6762 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
6763 ; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3
6764 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6765 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm25, %zmm0
6766 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1}
6767 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7]
6768 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6769 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
6770 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7]
6771 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,3,0,1]
6772 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4],ymm3[5,6,7,8,9,10,11],ymm10[12],ymm3[13,14,15]
6773 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7]
6774 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21
6775 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12
6776 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
6777 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
6778 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6779 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6780 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
6781 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
6782 ; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm3
6783 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6784 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7]
6785 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3
6786 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1]
6787 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1,2,3,4,5,6],ymm10[7,8],ymm2[9,10,11,12,13,14],ymm10[15]
6788 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
6789 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
6790 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
6791 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8
6792 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm9
6793 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1
6794 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4
6795 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6,7]
6796 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
6797 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6798 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
6799 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
6800 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6801 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5,6,7],ymm10[8,9,10,11,12],ymm4[13,14,15]
6802 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1
6803 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10
6804 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
6805 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
6806 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
6807 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm4, %zmm19
6808 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4
6809 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1],ymm13[2,3],ymm4[4,5],ymm13[6,7]
6810 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12
6811 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
6812 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
6813 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
6814 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
6815 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
6816 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
6817 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7]
6818 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm25, %zmm2
6819 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm2 {%k1}
6820 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7]
6821 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
6822 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15]
6823 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
6824 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,4,6,7]
6825 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
6826 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
6827 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
6828 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
6829 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
6830 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7]
6831 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
6832 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10
6833 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5,6,7]
6834 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1
6835 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4
6836 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7]
6837 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
6838 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
6839 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6840 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7]
6841 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
6842 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7]
6843 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1]
6844 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
6845 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7]
6846 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
6847 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
6848 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15]
6849 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
6850 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm25, %zmm3
6851 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
6852 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
6853 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload
6854 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
6855 ; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20
6856 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm1, %zmm9
6857 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rsi)
6858 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx)
6859 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx)
6860 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%r8)
6861 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r9)
6862 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6863 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax)
6864 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6865 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax)
6866 ; AVX512DQ-SLOW-NEXT: popq %rax
6867 ; AVX512DQ-SLOW-NEXT: vzeroupper
6868 ; AVX512DQ-SLOW-NEXT: retq
6869 ;
6870 ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf32:
6871 ; AVX512DQ-FAST: # %bb.0:
6872 ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27
6873 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm29
6874 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13]
6875 ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
6876 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24
6877 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [2,5,9,12,2,5,9,12]
6878 ; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1]
6879 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15]
6880 ; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm0
6881 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6882 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [3,6,10,13,3,6,10,13]
6883 ; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1]
6884 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm17
6885 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <1,u,u,u,5,8,12,15>
6886 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
6887 ; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm12
6888 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm1, %zmm15
6889 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,u,u,u,4,8,11,15>
6890 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2
6891 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,5,9,u,12,u,u,u>
6892 ; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm3
6893 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm30, %zmm9
6894 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,u,u,4,7,11,14>
6895 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm4, %zmm8
6896 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
6897 ; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm4, %zmm4
6898 ; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm21, %zmm5
6899 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm26
6900 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm26[0,1,0,2]
6901 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
6902 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
6903 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7]
6904 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6905 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
6906 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
6907 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
6908 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4
6909 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm1
6910 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
6911 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7]
6912 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u]
6913 ; AVX512DQ-FAST-NEXT: vporq %ymm7, %ymm10, %ymm22
6914 ; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7
6915 ; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm0
6916 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
6917 ; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm10[2],xmm7[2],xmm10[3],xmm7[3]
6918 ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0
6919 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
6920 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm22 {%k1}
6921 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm5
6922 ; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm7
6923 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
6924 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
6925 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3],xmm11[4],xmm10[5],xmm11[6,7]
6926 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
6927 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
6928 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4,5,6],xmm8[7]
6929 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
6930 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
6931 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9
6932 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
6933 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm23
6934 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6935 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7]
6936 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
6937 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
6938 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6939 ; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm9, %ymm20
6940 ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm3
6941 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9
6942 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7]
6943 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
6944 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7]
6945 ; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm10
6946 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
6947 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
6948 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
6949 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7]
6950 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0],xmm0[1],xmm10[2,3,4,5,6,7]
6951 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u]
6952 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm25
6953 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
6954 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13
6955 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5],xmm11[6],xmm13[7]
6956 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
6957 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
6958 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
6959 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
6960 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
6961 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
6962 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm16
6963 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6964 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7]
6965 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
6966 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
6967 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6968 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm11, %ymm11
6969 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7]
6970 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12
6971 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7]
6972 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
6973 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6974 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5]
6975 ; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm12
6976 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6977 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5,6],ymm12[7]
6978 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
6979 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31
6980 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
6981 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm15
6982 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18
6983 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm15
6984 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7]
6985 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14
6986 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm8
6987 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5],xmm8[6],xmm12[7]
6988 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm19, %zmm12
6989 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u]
6990 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
6991 ; AVX512DQ-FAST-NEXT: vpor %ymm12, %ymm8, %ymm8
6992 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm21, %zmm12
6993 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6994 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm15
6995 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm11
6996 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
6997 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
6998 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
6999 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm15 {%k1}
7000 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7001 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7002 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7]
7003 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm6
7004 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4
7005 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
7006 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
7007 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7008 ; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm11, %ymm12
7009 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7]
7010 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11
7011 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7]
7012 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
7013 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
7014 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm26[0,1,1,3]
7015 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
7016 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7]
7017 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
7018 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm19
7019 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm8
7020 ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5
7021 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7]
7022 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0
7023 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7]
7024 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2
7025 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7]
7026 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
7027 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7]
7028 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,7,10,14,u,u,u>
7029 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [2,11,2,11,12,5,8,9]
7030 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm21
7031 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm28, %zmm1
7032 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm17, %zmm19
7033 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
7034 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7035 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7036 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
7037 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
7038 ; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1
7039 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7040 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1}
7041 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
7042 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
7043 ; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1
7044 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
7045 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
7046 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7047 ; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm1
7048 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
7049 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
7050 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm28
7051 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
7052 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7]
7053 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
7054 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
7055 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
7056 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
7057 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
7058 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
7059 ; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3
7060 ; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm10, %xmm9
7061 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
7062 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,u,u,u,6,9,13,u>
7063 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm3
7064 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm3
7065 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7]
7066 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
7067 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
7068 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7]
7069 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm12
7070 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11
7071 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7]
7072 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm9, %zmm9
7073 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
7074 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7075 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7076 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
7077 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
7078 ; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1
7079 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13]
7080 ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1]
7081 ; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm11
7082 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,4,7,11,14,u,u,u>
7083 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7084 ; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm30, %zmm1
7085 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
7086 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,3,u,0,3,7,u>
7087 ; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm0, %ymm0
7088 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
7089 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
7090 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
7091 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2
7092 ; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm31, %xmm1
7093 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
7094 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
7095 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
7096 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14
7097 ; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13
7098 ; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm9
7099 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
7100 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
7101 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
7102 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3,4,5,6,7]
7103 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7]
7104 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13
7105 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7]
7106 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0]
7107 ; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1]
7108 ; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm13, %ymm13
7109 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
7110 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
7111 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7],ymm11[8,9,10,11,12],ymm13[13,14,15]
7112 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
7113 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm2
7114 ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10
7115 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <3,u,u,u,6,10,13,u>
7116 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
7117 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7]
7118 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14
7119 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7]
7120 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm13, %zmm13
7121 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
7122 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
7123 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7124 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15]
7125 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
7126 ; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm11, %ymm11
7127 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
7128 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,4,8,11,15,u,u,u>
7129 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm17, %zmm9
7130 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1}
7131 ; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm11, %zmm0
7132 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1
7133 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
7134 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
7135 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
7136 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
7137 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
7138 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
7139 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
7140 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7]
7141 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
7142 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
7143 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3]
7144 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7]
7145 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
7146 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
7147 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
7148 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
7149 ; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2
7150 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
7151 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7]
7152 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
7153 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7154 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
7155 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7156 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm17, %zmm1
7157 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
7158 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
7159 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
7160 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm23
7161 ; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm25
7162 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm25, %zmm0, %zmm21
7163 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%rsi)
7164 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rdx)
7165 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, (%rcx)
7166 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%r8)
7167 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%r9)
7168 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax)
7169 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
7170 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
7171 ; AVX512DQ-FAST-NEXT: vzeroupper
7172 ; AVX512DQ-FAST-NEXT: retq
7173 ;
7174 ; AVX512BW-LABEL: load_i16_stride7_vf32:
7175 ; AVX512BW: # %bb.0:
7176 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7177 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7178 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3
7179 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6
7180 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7
7181 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7182 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
7183 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
7184 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
7185 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
7186 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7187 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8
7188 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u>
7189 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
7190 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
7191 ; AVX512BW-NEXT: kmovd %edi, %k1
7192 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
7193 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
7194 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7195 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7196 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
7197 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7198 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
7199 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
7200 ; AVX512BW-NEXT: kmovd %edi, %k1
7201 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1}
7202 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
7203 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7204 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7205 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
7206 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7207 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
7208 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u>
7209 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
7210 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
7211 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7212 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8
7213 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
7214 ; AVX512BW-NEXT: kmovd %edi, %k2
7215 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
7216 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1}
7217 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
7218 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7219 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
7220 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
7221 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7222 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm9, %zmm10
7223 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
7224 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7225 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11
7226 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u>
7227 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
7228 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
7229 ; AVX512BW-NEXT: kmovd %edi, %k1
7230 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k1}
7231 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00
7232 ; AVX512BW-NEXT: kmovd %edi, %k2
7233 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
7234 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
7235 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7236 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7237 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
7238 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7239 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
7240 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
7241 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7242 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
7243 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u>
7244 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
7245 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1}
7246 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
7247 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
7248 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7249 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7250 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
7251 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7252 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
7253 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
7254 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7255 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
7256 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u>
7257 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13
7258 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1}
7259 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
7260 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
7261 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7262 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7263 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
7264 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7265 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
7266 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
7267 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7268 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10
7269 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u>
7270 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14
7271 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1}
7272 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2}
7273 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
7274 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7275 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10
7276 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
7277 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7278 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm6
7279 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
7280 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7281 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3
7282 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u>
7283 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4
7284 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
7285 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2}
7286 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi)
7287 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx)
7288 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx)
7289 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8)
7290 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9)
7291 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10)
7292 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax)
7293 ; AVX512BW-NEXT: vzeroupper
7294 ; AVX512BW-NEXT: retq
7295 %wide.vec = load <224 x i16>, ptr %in.vec, align 64
7296 %strided.vec0 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
7297 %strided.vec1 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
7298 %strided.vec2 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
7299 %strided.vec3 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
7300 %strided.vec4 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
7301 %strided.vec5 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
7302 %strided.vec6 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
7303 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
7304 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
7305 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
7306 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
7307 store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
7308 store <32 x i16> %strided.vec5, ptr %out.vec5, align 64
7309 store <32 x i16> %strided.vec6, ptr %out.vec6, align 64
7310 ret void
7311 }
7313 define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
7314 ; SSE-LABEL: load_i16_stride7_vf64:
7315 ; SSE: # %bb.0:
7316 ; SSE-NEXT: subq $1352, %rsp # imm = 0x548
7317 ; SSE-NEXT: movdqa 640(%rdi), %xmm9
7318 ; SSE-NEXT: movdqa 624(%rdi), %xmm12
7319 ; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill
7320 ; SSE-NEXT: movdqa 112(%rdi), %xmm8
7321 ; SSE-NEXT: movdqa 128(%rdi), %xmm10
7322 ; SSE-NEXT: movaps 160(%rdi), %xmm6
7323 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7324 ; SSE-NEXT: movaps 144(%rdi), %xmm13
7325 ; SSE-NEXT: movdqa 192(%rdi), %xmm2
7326 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7327 ; SSE-NEXT: movdqa 176(%rdi), %xmm4
7328 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7329 ; SSE-NEXT: movdqa 208(%rdi), %xmm11
7330 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
7331 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7332 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
7333 ; SSE-NEXT: movdqa %xmm3, %xmm1
7334 ; SSE-NEXT: pandn %xmm0, %xmm1
7335 ; SSE-NEXT: movdqa %xmm4, %xmm0
7336 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7337 ; SSE-NEXT: pand %xmm3, %xmm0
7338 ; SSE-NEXT: por %xmm1, %xmm0
7339 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0]
7340 ; SSE-NEXT: movdqa %xmm14, %xmm1
7341 ; SSE-NEXT: pandn %xmm0, %xmm1
7342 ; SSE-NEXT: movaps %xmm13, %xmm0
7343 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7344 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2]
7345 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
7346 ; SSE-NEXT: movaps %xmm6, %xmm2
7347 ; SSE-NEXT: andnps %xmm0, %xmm2
7348 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
7349 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7350 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,3]
7351 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7352 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
7353 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7354 ; SSE-NEXT: movdqa 656(%rdi), %xmm0
7355 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7356 ; SSE-NEXT: pand %xmm6, %xmm4
7357 ; SSE-NEXT: por %xmm2, %xmm4
7358 ; SSE-NEXT: pand %xmm14, %xmm4
7359 ; SSE-NEXT: por %xmm1, %xmm4
7360 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7361 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7362 ; SSE-NEXT: movdqa %xmm3, %xmm1
7363 ; SSE-NEXT: pandn %xmm0, %xmm1
7364 ; SSE-NEXT: movdqa %xmm12, %xmm0
7365 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
7366 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7367 ; SSE-NEXT: pand %xmm3, %xmm0
7368 ; SSE-NEXT: por %xmm1, %xmm0
7369 ; SSE-NEXT: movdqa %xmm14, %xmm1
7370 ; SSE-NEXT: pandn %xmm0, %xmm1
7371 ; SSE-NEXT: movaps 608(%rdi), %xmm2
7372 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7373 ; SSE-NEXT: movaps 592(%rdi), %xmm0
7374 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7375 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7376 ; SSE-NEXT: movaps %xmm6, %xmm2
7377 ; SSE-NEXT: andnps %xmm0, %xmm2
7378 ; SSE-NEXT: movdqa 560(%rdi), %xmm15
7379 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3]
7380 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7381 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7382 ; SSE-NEXT: movdqa 576(%rdi), %xmm5
7383 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
7384 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7385 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7386 ; SSE-NEXT: pand %xmm6, %xmm4
7387 ; SSE-NEXT: por %xmm2, %xmm4
7388 ; SSE-NEXT: pand %xmm14, %xmm4
7389 ; SSE-NEXT: por %xmm1, %xmm4
7390 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7391 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
7392 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7393 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7394 ; SSE-NEXT: movdqa %xmm3, %xmm1
7395 ; SSE-NEXT: pandn %xmm0, %xmm1
7396 ; SSE-NEXT: movdqa 80(%rdi), %xmm2
7397 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7398 ; SSE-NEXT: movdqa 64(%rdi), %xmm0
7399 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7400 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7401 ; SSE-NEXT: pand %xmm3, %xmm0
7402 ; SSE-NEXT: por %xmm1, %xmm0
7403 ; SSE-NEXT: movdqa %xmm14, %xmm1
7404 ; SSE-NEXT: pandn %xmm0, %xmm1
7405 ; SSE-NEXT: movaps 32(%rdi), %xmm0
7406 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7407 ; SSE-NEXT: movaps 48(%rdi), %xmm4
7408 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7409 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
7410 ; SSE-NEXT: movaps %xmm6, %xmm2
7411 ; SSE-NEXT: andnps %xmm0, %xmm2
7412 ; SSE-NEXT: movdqa (%rdi), %xmm0
7413 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7414 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7415 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7416 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
7417 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7418 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7419 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7420 ; SSE-NEXT: pand %xmm6, %xmm4
7421 ; SSE-NEXT: por %xmm2, %xmm4
7422 ; SSE-NEXT: pand %xmm14, %xmm4
7423 ; SSE-NEXT: por %xmm1, %xmm4
7424 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7425 ; SSE-NEXT: movdqa 544(%rdi), %xmm0
7426 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7427 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7428 ; SSE-NEXT: movdqa %xmm3, %xmm1
7429 ; SSE-NEXT: pandn %xmm0, %xmm1
7430 ; SSE-NEXT: movdqa 528(%rdi), %xmm2
7431 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7432 ; SSE-NEXT: movdqa 512(%rdi), %xmm0
7433 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7434 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7435 ; SSE-NEXT: pand %xmm3, %xmm0
7436 ; SSE-NEXT: por %xmm1, %xmm0
7437 ; SSE-NEXT: movdqa %xmm14, %xmm1
7438 ; SSE-NEXT: pandn %xmm0, %xmm1
7439 ; SSE-NEXT: movaps 496(%rdi), %xmm2
7440 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7441 ; SSE-NEXT: movaps 480(%rdi), %xmm0
7442 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7443 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7444 ; SSE-NEXT: movaps %xmm6, %xmm2
7445 ; SSE-NEXT: andnps %xmm0, %xmm2
7446 ; SSE-NEXT: movdqa 448(%rdi), %xmm0
7447 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7448 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7449 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7450 ; SSE-NEXT: movdqa 464(%rdi), %xmm12
7451 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
7452 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7453 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7454 ; SSE-NEXT: pand %xmm6, %xmm4
7455 ; SSE-NEXT: por %xmm2, %xmm4
7456 ; SSE-NEXT: pand %xmm14, %xmm4
7457 ; SSE-NEXT: por %xmm1, %xmm4
7458 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7459 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
7460 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7461 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7462 ; SSE-NEXT: movdqa %xmm3, %xmm1
7463 ; SSE-NEXT: pandn %xmm0, %xmm1
7464 ; SSE-NEXT: movdqa 416(%rdi), %xmm2
7465 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7466 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
7467 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7468 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7469 ; SSE-NEXT: pand %xmm3, %xmm0
7470 ; SSE-NEXT: por %xmm1, %xmm0
7471 ; SSE-NEXT: movdqa %xmm14, %xmm1
7472 ; SSE-NEXT: pandn %xmm0, %xmm1
7473 ; SSE-NEXT: movaps 384(%rdi), %xmm2
7474 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7475 ; SSE-NEXT: movaps 368(%rdi), %xmm0
7476 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7477 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7478 ; SSE-NEXT: movaps %xmm6, %xmm2
7479 ; SSE-NEXT: andnps %xmm0, %xmm2
7480 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
7481 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7482 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7483 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7484 ; SSE-NEXT: movdqa 352(%rdi), %xmm0
7485 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7486 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7487 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7488 ; SSE-NEXT: pand %xmm6, %xmm4
7489 ; SSE-NEXT: por %xmm2, %xmm4
7490 ; SSE-NEXT: pand %xmm14, %xmm4
7491 ; SSE-NEXT: por %xmm1, %xmm4
7492 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7493 ; SSE-NEXT: movdqa 880(%rdi), %xmm0
7494 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7495 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7496 ; SSE-NEXT: movdqa %xmm3, %xmm1
7497 ; SSE-NEXT: pandn %xmm0, %xmm1
7498 ; SSE-NEXT: movdqa 864(%rdi), %xmm4
7499 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7500 ; SSE-NEXT: movdqa 848(%rdi), %xmm0
7501 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7502 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
7503 ; SSE-NEXT: pand %xmm3, %xmm0
7504 ; SSE-NEXT: por %xmm1, %xmm0
7505 ; SSE-NEXT: movdqa %xmm14, %xmm1
7506 ; SSE-NEXT: pandn %xmm0, %xmm1
7507 ; SSE-NEXT: movaps 832(%rdi), %xmm2
7508 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7509 ; SSE-NEXT: movaps 816(%rdi), %xmm0
7510 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7511 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7512 ; SSE-NEXT: movaps %xmm6, %xmm2
7513 ; SSE-NEXT: andnps %xmm0, %xmm2
7514 ; SSE-NEXT: movdqa 784(%rdi), %xmm0
7515 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7516 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7517 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7518 ; SSE-NEXT: movdqa 800(%rdi), %xmm0
7519 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7520 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7521 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7522 ; SSE-NEXT: pand %xmm6, %xmm4
7523 ; SSE-NEXT: por %xmm2, %xmm4
7524 ; SSE-NEXT: pand %xmm14, %xmm4
7525 ; SSE-NEXT: por %xmm1, %xmm4
7526 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7527 ; SSE-NEXT: movdqa 320(%rdi), %xmm0
7528 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7529 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7530 ; SSE-NEXT: movdqa %xmm3, %xmm1
7531 ; SSE-NEXT: pandn %xmm0, %xmm1
7532 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
7533 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7534 ; SSE-NEXT: movdqa 288(%rdi), %xmm0
7535 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7536 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7537 ; SSE-NEXT: pand %xmm3, %xmm0
7538 ; SSE-NEXT: por %xmm1, %xmm0
7539 ; SSE-NEXT: movdqa %xmm14, %xmm1
7540 ; SSE-NEXT: pandn %xmm0, %xmm1
7541 ; SSE-NEXT: movaps 272(%rdi), %xmm2
7542 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7543 ; SSE-NEXT: movaps 256(%rdi), %xmm0
7544 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7545 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7546 ; SSE-NEXT: movaps %xmm6, %xmm2
7547 ; SSE-NEXT: andnps %xmm0, %xmm2
7548 ; SSE-NEXT: movdqa 224(%rdi), %xmm0
7549 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7550 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7551 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7552 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
7553 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7554 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7555 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7556 ; SSE-NEXT: pand %xmm6, %xmm4
7557 ; SSE-NEXT: por %xmm2, %xmm4
7558 ; SSE-NEXT: pand %xmm14, %xmm4
7559 ; SSE-NEXT: por %xmm1, %xmm4
7560 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7561 ; SSE-NEXT: movdqa 768(%rdi), %xmm0
7562 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7563 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
7564 ; SSE-NEXT: movdqa %xmm3, %xmm1
7565 ; SSE-NEXT: pandn %xmm0, %xmm1
7566 ; SSE-NEXT: movdqa 752(%rdi), %xmm2
7567 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7568 ; SSE-NEXT: movdqa 736(%rdi), %xmm0
7569 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7570 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
7571 ; SSE-NEXT: pand %xmm3, %xmm0
7572 ; SSE-NEXT: por %xmm1, %xmm0
7573 ; SSE-NEXT: movdqa %xmm14, %xmm1
7574 ; SSE-NEXT: pandn %xmm0, %xmm1
7575 ; SSE-NEXT: movaps 720(%rdi), %xmm2
7576 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7577 ; SSE-NEXT: movaps 704(%rdi), %xmm0
7578 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7579 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
7580 ; SSE-NEXT: movaps %xmm6, %xmm2
7581 ; SSE-NEXT: andnps %xmm0, %xmm2
7582 ; SSE-NEXT: movdqa 672(%rdi), %xmm0
7583 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7584 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
7585 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
7586 ; SSE-NEXT: movdqa 688(%rdi), %xmm0
7587 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7588 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
7589 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7590 ; SSE-NEXT: pand %xmm6, %xmm4
7591 ; SSE-NEXT: por %xmm2, %xmm4
7592 ; SSE-NEXT: pand %xmm14, %xmm4
7593 ; SSE-NEXT: por %xmm1, %xmm4
7594 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7595 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
7596 ; SSE-NEXT: movdqa %xmm3, %xmm1
7597 ; SSE-NEXT: pandn %xmm11, %xmm1
7598 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7599 ; SSE-NEXT: psrld $16, %xmm0
7600 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7601 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7602 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
7603 ; SSE-NEXT: pand %xmm3, %xmm2
7604 ; SSE-NEXT: por %xmm1, %xmm2
7605 ; SSE-NEXT: movdqa %xmm14, %xmm1
7606 ; SSE-NEXT: pandn %xmm2, %xmm1
7607 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
7608 ; SSE-NEXT: movdqa %xmm7, %xmm2
7609 ; SSE-NEXT: pandn %xmm8, %xmm2
7610 ; SSE-NEXT: pand %xmm7, %xmm10
7611 ; SSE-NEXT: por %xmm2, %xmm10
7612 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
7613 ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
7614 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,1]
7615 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7616 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7617 ; SSE-NEXT: movdqa %xmm6, %xmm0
7618 ; SSE-NEXT: pandn %xmm2, %xmm0
7619 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,3,2,3]
7620 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7621 ; SSE-NEXT: pand %xmm6, %xmm2
7622 ; SSE-NEXT: por %xmm2, %xmm0
7623 ; SSE-NEXT: pand %xmm14, %xmm0
7624 ; SSE-NEXT: por %xmm1, %xmm0
7625 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7626 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7627 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7628 ; SSE-NEXT: movdqa %xmm3, %xmm2
7629 ; SSE-NEXT: pandn %xmm1, %xmm2
7630 ; SSE-NEXT: psrld $16, %xmm9
7631 ; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
7632 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7633 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
7634 ; SSE-NEXT: pand %xmm3, %xmm4
7635 ; SSE-NEXT: por %xmm2, %xmm4
7636 ; SSE-NEXT: movdqa %xmm14, %xmm1
7637 ; SSE-NEXT: pandn %xmm4, %xmm1
7638 ; SSE-NEXT: movdqa %xmm7, %xmm2
7639 ; SSE-NEXT: pandn %xmm15, %xmm2
7640 ; SSE-NEXT: pand %xmm7, %xmm5
7641 ; SSE-NEXT: por %xmm2, %xmm5
7642 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7643 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7644 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7645 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7646 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7647 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7648 ; SSE-NEXT: movdqa %xmm6, %xmm0
7649 ; SSE-NEXT: pandn %xmm2, %xmm0
7650 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3]
7651 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7652 ; SSE-NEXT: pand %xmm6, %xmm2
7653 ; SSE-NEXT: por %xmm2, %xmm0
7654 ; SSE-NEXT: pand %xmm14, %xmm0
7655 ; SSE-NEXT: por %xmm1, %xmm0
7656 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7657 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7658 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7659 ; SSE-NEXT: movdqa %xmm3, %xmm2
7660 ; SSE-NEXT: pandn %xmm1, %xmm2
7661 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7662 ; SSE-NEXT: psrld $16, %xmm1
7663 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7664 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7665 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7666 ; SSE-NEXT: pand %xmm3, %xmm4
7667 ; SSE-NEXT: por %xmm2, %xmm4
7668 ; SSE-NEXT: movdqa %xmm14, %xmm1
7669 ; SSE-NEXT: pandn %xmm4, %xmm1
7670 ; SSE-NEXT: movdqa %xmm7, %xmm2
7671 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7672 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7673 ; SSE-NEXT: pand %xmm7, %xmm4
7674 ; SSE-NEXT: por %xmm2, %xmm4
7675 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7676 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7677 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7678 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7679 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7680 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7681 ; SSE-NEXT: movdqa %xmm6, %xmm0
7682 ; SSE-NEXT: pandn %xmm2, %xmm0
7683 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7684 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7685 ; SSE-NEXT: pand %xmm6, %xmm2
7686 ; SSE-NEXT: por %xmm2, %xmm0
7687 ; SSE-NEXT: pand %xmm14, %xmm0
7688 ; SSE-NEXT: por %xmm1, %xmm0
7689 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7690 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7691 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7692 ; SSE-NEXT: movdqa %xmm3, %xmm2
7693 ; SSE-NEXT: pandn %xmm1, %xmm2
7694 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7695 ; SSE-NEXT: psrld $16, %xmm1
7696 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7697 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7698 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7699 ; SSE-NEXT: pand %xmm3, %xmm4
7700 ; SSE-NEXT: por %xmm2, %xmm4
7701 ; SSE-NEXT: movdqa %xmm14, %xmm1
7702 ; SSE-NEXT: pandn %xmm4, %xmm1
7703 ; SSE-NEXT: movdqa %xmm7, %xmm2
7704 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7705 ; SSE-NEXT: pand %xmm7, %xmm12
7706 ; SSE-NEXT: por %xmm2, %xmm12
7707 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7708 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7709 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7710 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7711 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7712 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7713 ; SSE-NEXT: movdqa %xmm6, %xmm0
7714 ; SSE-NEXT: pandn %xmm2, %xmm0
7715 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3]
7716 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7717 ; SSE-NEXT: pand %xmm6, %xmm2
7718 ; SSE-NEXT: por %xmm2, %xmm0
7719 ; SSE-NEXT: pand %xmm14, %xmm0
7720 ; SSE-NEXT: por %xmm1, %xmm0
7721 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7722 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7723 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7724 ; SSE-NEXT: movdqa %xmm3, %xmm2
7725 ; SSE-NEXT: pandn %xmm1, %xmm2
7726 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7727 ; SSE-NEXT: psrld $16, %xmm1
7728 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7729 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7730 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7731 ; SSE-NEXT: pand %xmm3, %xmm4
7732 ; SSE-NEXT: por %xmm2, %xmm4
7733 ; SSE-NEXT: movdqa %xmm14, %xmm1
7734 ; SSE-NEXT: pandn %xmm4, %xmm1
7735 ; SSE-NEXT: movdqa %xmm7, %xmm2
7736 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7737 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7738 ; SSE-NEXT: pand %xmm7, %xmm4
7739 ; SSE-NEXT: por %xmm2, %xmm4
7740 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7741 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7742 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7743 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7744 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7745 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7746 ; SSE-NEXT: movdqa %xmm6, %xmm0
7747 ; SSE-NEXT: pandn %xmm2, %xmm0
7748 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7749 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7750 ; SSE-NEXT: pand %xmm6, %xmm2
7751 ; SSE-NEXT: por %xmm2, %xmm0
7752 ; SSE-NEXT: pand %xmm14, %xmm0
7753 ; SSE-NEXT: por %xmm1, %xmm0
7754 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7755 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7756 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7757 ; SSE-NEXT: movdqa %xmm3, %xmm2
7758 ; SSE-NEXT: pandn %xmm1, %xmm2
7759 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7760 ; SSE-NEXT: psrld $16, %xmm1
7761 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7762 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7763 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7764 ; SSE-NEXT: pand %xmm3, %xmm4
7765 ; SSE-NEXT: por %xmm2, %xmm4
7766 ; SSE-NEXT: movdqa %xmm14, %xmm1
7767 ; SSE-NEXT: pandn %xmm4, %xmm1
7768 ; SSE-NEXT: movdqa %xmm7, %xmm2
7769 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7770 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7771 ; SSE-NEXT: pand %xmm7, %xmm4
7772 ; SSE-NEXT: por %xmm2, %xmm4
7773 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7774 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7775 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7776 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7777 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7778 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7779 ; SSE-NEXT: movdqa %xmm6, %xmm0
7780 ; SSE-NEXT: pandn %xmm2, %xmm0
7781 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7782 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7783 ; SSE-NEXT: pand %xmm6, %xmm2
7784 ; SSE-NEXT: por %xmm2, %xmm0
7785 ; SSE-NEXT: pand %xmm14, %xmm0
7786 ; SSE-NEXT: por %xmm1, %xmm0
7787 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7788 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7789 ; SSE-NEXT: movdqa %xmm10, %xmm1
7790 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7791 ; SSE-NEXT: movdqa %xmm3, %xmm2
7792 ; SSE-NEXT: pandn %xmm1, %xmm2
7793 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
7794 ; SSE-NEXT: movdqa %xmm11, %xmm1
7795 ; SSE-NEXT: psrld $16, %xmm1
7796 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7797 ; SSE-NEXT: movdqa %xmm9, %xmm4
7798 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7799 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7800 ; SSE-NEXT: pand %xmm3, %xmm4
7801 ; SSE-NEXT: por %xmm2, %xmm4
7802 ; SSE-NEXT: movdqa %xmm14, %xmm1
7803 ; SSE-NEXT: pandn %xmm4, %xmm1
7804 ; SSE-NEXT: movdqa %xmm7, %xmm2
7805 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7806 ; SSE-NEXT: pandn %xmm12, %xmm2
7807 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7808 ; SSE-NEXT: movdqa %xmm13, %xmm4
7809 ; SSE-NEXT: pand %xmm7, %xmm4
7810 ; SSE-NEXT: por %xmm2, %xmm4
7811 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7812 ; SSE-NEXT: movdqa %xmm5, %xmm2
7813 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7814 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
7815 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
7816 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7817 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
7818 ; SSE-NEXT: movdqa %xmm6, %xmm15
7819 ; SSE-NEXT: pandn %xmm2, %xmm15
7820 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7821 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7822 ; SSE-NEXT: pand %xmm6, %xmm2
7823 ; SSE-NEXT: por %xmm2, %xmm15
7824 ; SSE-NEXT: pand %xmm14, %xmm15
7825 ; SSE-NEXT: por %xmm1, %xmm15
7826 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7827 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7828 ; SSE-NEXT: movdqa %xmm15, %xmm1
7829 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
7830 ; SSE-NEXT: movdqa %xmm3, %xmm2
7831 ; SSE-NEXT: pandn %xmm1, %xmm2
7832 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7833 ; SSE-NEXT: psrld $16, %xmm1
7834 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7835 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7836 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7837 ; SSE-NEXT: pand %xmm3, %xmm4
7838 ; SSE-NEXT: por %xmm2, %xmm4
7839 ; SSE-NEXT: movdqa %xmm14, %xmm1
7840 ; SSE-NEXT: pandn %xmm4, %xmm1
7841 ; SSE-NEXT: movdqa %xmm7, %xmm2
7842 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7843 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7844 ; SSE-NEXT: pand %xmm7, %xmm4
7845 ; SSE-NEXT: por %xmm2, %xmm4
7846 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7847 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7848 ; SSE-NEXT: pand %xmm6, %xmm2
7849 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7850 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7851 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
7852 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
7853 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
7854 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
7855 ; SSE-NEXT: pandn %xmm4, %xmm6
7856 ; SSE-NEXT: por %xmm2, %xmm6
7857 ; SSE-NEXT: pand %xmm14, %xmm6
7858 ; SSE-NEXT: por %xmm1, %xmm6
7859 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7860 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7861 ; SSE-NEXT: # xmm1 = mem[0,1,0,1]
7862 ; SSE-NEXT: movdqa %xmm3, %xmm2
7863 ; SSE-NEXT: pandn %xmm1, %xmm2
7864 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7865 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7866 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
7867 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
7868 ; SSE-NEXT: pand %xmm3, %xmm1
7869 ; SSE-NEXT: por %xmm2, %xmm1
7870 ; SSE-NEXT: movdqa %xmm14, %xmm2
7871 ; SSE-NEXT: pandn %xmm1, %xmm2
7872 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7873 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
7874 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7]
7875 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7876 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
7877 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1]
7878 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7879 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7880 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
7881 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7882 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
7883 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7884 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
7885 ; SSE-NEXT: andps %xmm14, %xmm4
7886 ; SSE-NEXT: orps %xmm2, %xmm4
7887 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7888 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7889 ; SSE-NEXT: # xmm1 = mem[0,1,0,1]
7890 ; SSE-NEXT: movdqa %xmm3, %xmm2
7891 ; SSE-NEXT: pandn %xmm1, %xmm2
7892 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7893 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7894 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
7895 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
7896 ; SSE-NEXT: pand %xmm3, %xmm1
7897 ; SSE-NEXT: por %xmm2, %xmm1
7898 ; SSE-NEXT: movdqa %xmm14, %xmm2
7899 ; SSE-NEXT: pandn %xmm1, %xmm2
7900 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7901 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
7902 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7]
7903 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7904 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
7905 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1]
7906 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7907 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7908 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
7909 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7910 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
7911 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
7912 ; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3]
7913 ; SSE-NEXT: andps %xmm14, %xmm6
7914 ; SSE-NEXT: orps %xmm2, %xmm6
7915 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7916 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,1]
7917 ; SSE-NEXT: movdqa %xmm3, %xmm4
7918 ; SSE-NEXT: pandn %xmm2, %xmm4
7919 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
7920 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,1]
7921 ; SSE-NEXT: pand %xmm3, %xmm2
7922 ; SSE-NEXT: por %xmm4, %xmm2
7923 ; SSE-NEXT: movdqa %xmm14, %xmm4
7924 ; SSE-NEXT: pandn %xmm2, %xmm4
7925 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
7926 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
7927 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3]
7928 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
7929 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
7930 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7931 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,1,2,3]
7932 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
7933 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
7934 ; SSE-NEXT: andps %xmm14, %xmm0
7935 ; SSE-NEXT: orps %xmm4, %xmm0
7936 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7937 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7938 ; SSE-NEXT: # xmm2 = mem[0,1,0,1]
7939 ; SSE-NEXT: movdqa %xmm3, %xmm4
7940 ; SSE-NEXT: pandn %xmm2, %xmm4
7941 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
7942 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7943 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
7944 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
7945 ; SSE-NEXT: pand %xmm3, %xmm2
7946 ; SSE-NEXT: por %xmm4, %xmm2
7947 ; SSE-NEXT: movdqa %xmm14, %xmm4
7948 ; SSE-NEXT: pandn %xmm2, %xmm4
7949 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7950 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
7951 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
7952 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7953 ; SSE-NEXT: # xmm2 = mem[2,2,3,3]
7954 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
7955 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7956 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7957 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7958 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
7959 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
7960 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
7961 ; SSE-NEXT: andps %xmm14, %xmm0
7962 ; SSE-NEXT: orps %xmm4, %xmm0
7963 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7964 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7965 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
7966 ; SSE-NEXT: movdqa %xmm3, %xmm6
7967 ; SSE-NEXT: pandn %xmm4, %xmm6
7968 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7969 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7970 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
7971 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
7972 ; SSE-NEXT: pand %xmm3, %xmm4
7973 ; SSE-NEXT: por %xmm6, %xmm4
7974 ; SSE-NEXT: movdqa %xmm14, %xmm6
7975 ; SSE-NEXT: pandn %xmm4, %xmm6
7976 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7977 ; SSE-NEXT: # xmm4 = mem[0,1,0,3]
7978 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7]
7979 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7980 ; SSE-NEXT: # xmm4 = mem[2,2,3,3]
7981 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
7982 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7983 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
7984 ; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
7985 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3]
7986 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
7987 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
7988 ; SSE-NEXT: andps %xmm14, %xmm0
7989 ; SSE-NEXT: orps %xmm6, %xmm0
7990 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7991 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7992 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
7993 ; SSE-NEXT: movdqa %xmm3, %xmm6
7994 ; SSE-NEXT: pandn %xmm4, %xmm6
7995 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7996 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
7997 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
7998 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
7999 ; SSE-NEXT: pand %xmm3, %xmm4
8000 ; SSE-NEXT: por %xmm6, %xmm4
8001 ; SSE-NEXT: movdqa %xmm14, %xmm6
8002 ; SSE-NEXT: pandn %xmm4, %xmm6
8003 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8004 ; SSE-NEXT: # xmm4 = mem[0,1,0,3]
8005 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7]
8006 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8007 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
8008 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
8009 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8010 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8011 ; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
8012 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,1,2,3]
8013 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
8014 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3]
8015 ; SSE-NEXT: andps %xmm14, %xmm0
8016 ; SSE-NEXT: orps %xmm6, %xmm0
8017 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8018 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1]
8019 ; SSE-NEXT: movdqa %xmm3, %xmm10
8020 ; SSE-NEXT: pandn %xmm6, %xmm10
8021 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8022 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8023 ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
8024 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
8025 ; SSE-NEXT: pand %xmm3, %xmm6
8026 ; SSE-NEXT: por %xmm10, %xmm6
8027 ; SSE-NEXT: movdqa %xmm14, %xmm11
8028 ; SSE-NEXT: pandn %xmm6, %xmm11
8029 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8030 ; SSE-NEXT: # xmm6 = mem[0,1,0,3]
8031 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,4,7]
8032 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8033 ; SSE-NEXT: # xmm6 = mem[2,2,3,3]
8034 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1]
8035 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8036 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
8037 ; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3]
8038 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,1,2,3]
8039 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
8040 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
8041 ; SSE-NEXT: andps %xmm14, %xmm0
8042 ; SSE-NEXT: orps %xmm11, %xmm0
8043 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8044 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8045 ; SSE-NEXT: movdqa %xmm0, %xmm6
8046 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8047 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
8048 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
8049 ; SSE-NEXT: pand %xmm3, %xmm6
8050 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8051 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,0,1]
8052 ; SSE-NEXT: pandn %xmm11, %xmm3
8053 ; SSE-NEXT: por %xmm6, %xmm3
8054 ; SSE-NEXT: movdqa %xmm14, %xmm6
8055 ; SSE-NEXT: pandn %xmm3, %xmm6
8056 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8057 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,0,3]
8058 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,7]
8059 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8060 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3]
8061 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1]
8062 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8063 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8064 ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
8065 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3]
8066 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
8067 ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3]
8068 ; SSE-NEXT: andps %xmm14, %xmm5
8069 ; SSE-NEXT: orps %xmm6, %xmm5
8070 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8071 ; SSE-NEXT: movdqa %xmm7, %xmm6
8072 ; SSE-NEXT: pandn %xmm13, %xmm6
8073 ; SSE-NEXT: movdqa %xmm15, %xmm11
8074 ; SSE-NEXT: pand %xmm7, %xmm11
8075 ; SSE-NEXT: por %xmm6, %xmm11
8076 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,0,4,5,6,7]
8077 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
8078 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
8079 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
8080 ; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3]
8081 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
8082 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
8083 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7]
8084 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
8085 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
8086 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
8087 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
8088 ; SSE-NEXT: movdqa %xmm14, %xmm0
8089 ; SSE-NEXT: pandn %xmm3, %xmm0
8090 ; SSE-NEXT: andps %xmm14, %xmm6
8091 ; SSE-NEXT: por %xmm6, %xmm0
8092 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8093 ; SSE-NEXT: movdqa %xmm7, %xmm3
8094 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8095 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8096 ; SSE-NEXT: movdqa %xmm9, %xmm6
8097 ; SSE-NEXT: pand %xmm7, %xmm6
8098 ; SSE-NEXT: por %xmm3, %xmm6
8099 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,0,4,5,6,7]
8100 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
8101 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8102 ; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7]
8103 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
8104 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3]
8105 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8106 ; SSE-NEXT: movdqa %xmm0, %xmm5
8107 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8108 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
8109 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
8110 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
8111 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8112 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
8113 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
8114 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
8115 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
8116 ; SSE-NEXT: movdqa %xmm14, %xmm1
8117 ; SSE-NEXT: pandn %xmm5, %xmm1
8118 ; SSE-NEXT: andps %xmm14, %xmm3
8119 ; SSE-NEXT: por %xmm3, %xmm1
8120 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8121 ; SSE-NEXT: movdqa %xmm7, %xmm3
8122 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8123 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8124 ; SSE-NEXT: pand %xmm7, %xmm5
8125 ; SSE-NEXT: por %xmm3, %xmm5
8126 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7]
8127 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
8128 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8129 ; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7]
8130 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
8131 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3]
8132 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8133 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8134 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
8135 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
8136 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
8137 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8138 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
8139 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
8140 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
8141 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
8142 ; SSE-NEXT: movdqa %xmm14, %xmm1
8143 ; SSE-NEXT: pandn %xmm5, %xmm1
8144 ; SSE-NEXT: andps %xmm14, %xmm3
8145 ; SSE-NEXT: por %xmm3, %xmm1
8146 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8147 ; SSE-NEXT: movdqa %xmm7, %xmm3
8148 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8149 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8150 ; SSE-NEXT: pand %xmm7, %xmm5
8151 ; SSE-NEXT: por %xmm3, %xmm5
8152 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7]
8153 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
8154 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8155 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
8156 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
8157 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
8158 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8159 ; SSE-NEXT: movdqa %xmm11, %xmm1
8160 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8161 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
8162 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
8163 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
8164 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8165 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
8166 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
8167 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8168 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
8169 ; SSE-NEXT: movdqa %xmm14, %xmm13
8170 ; SSE-NEXT: pandn %xmm1, %xmm13
8171 ; SSE-NEXT: andps %xmm14, %xmm3
8172 ; SSE-NEXT: por %xmm3, %xmm13
8173 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8174 ; SSE-NEXT: movdqa %xmm7, %xmm1
8175 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8176 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8177 ; SSE-NEXT: pand %xmm7, %xmm3
8178 ; SSE-NEXT: por %xmm1, %xmm3
8179 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7]
8180 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8181 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,7]
8182 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
8183 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
8184 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8185 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8186 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8187 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
8188 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7]
8189 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8190 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8191 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
8192 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
8193 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
8194 ; SSE-NEXT: movdqa %xmm14, %xmm13
8195 ; SSE-NEXT: pandn %xmm3, %xmm13
8196 ; SSE-NEXT: andps %xmm14, %xmm1
8197 ; SSE-NEXT: por %xmm1, %xmm13
8198 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8199 ; SSE-NEXT: movdqa %xmm7, %xmm1
8200 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8201 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8202 ; SSE-NEXT: pand %xmm7, %xmm3
8203 ; SSE-NEXT: por %xmm1, %xmm3
8204 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7]
8205 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8206 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
8207 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
8208 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
8209 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
8210 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8211 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8212 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8213 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
8214 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8215 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8216 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
8217 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
8218 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
8219 ; SSE-NEXT: movdqa %xmm14, %xmm3
8220 ; SSE-NEXT: pandn %xmm2, %xmm3
8221 ; SSE-NEXT: andps %xmm14, %xmm1
8222 ; SSE-NEXT: por %xmm1, %xmm3
8223 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8224 ; SSE-NEXT: movdqa %xmm7, %xmm1
8225 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8226 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8227 ; SSE-NEXT: pand %xmm7, %xmm2
8228 ; SSE-NEXT: por %xmm1, %xmm2
8229 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7]
8230 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8231 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7]
8232 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
8233 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
8234 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8235 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8236 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8237 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8238 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
8239 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8240 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8241 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
8242 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
8243 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
8244 ; SSE-NEXT: movdqa %xmm14, %xmm10
8245 ; SSE-NEXT: pandn %xmm2, %xmm10
8246 ; SSE-NEXT: andps %xmm14, %xmm1
8247 ; SSE-NEXT: por %xmm1, %xmm10
8248 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8249 ; SSE-NEXT: movdqa %xmm7, %xmm1
8250 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8251 ; SSE-NEXT: pand %xmm7, %xmm12
8252 ; SSE-NEXT: por %xmm1, %xmm12
8253 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,0,4,5,6,7]
8254 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8255 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
8256 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
8257 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
8258 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8259 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8260 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8261 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8262 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
8263 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8264 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8265 ; SSE-NEXT: andps %xmm14, %xmm1
8266 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
8267 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
8268 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
8269 ; SSE-NEXT: pandn %xmm2, %xmm14
8270 ; SSE-NEXT: por %xmm1, %xmm14
8271 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8272 ; SSE-NEXT: movdqa %xmm2, %xmm1
8273 ; SSE-NEXT: psrld $16, %xmm1
8274 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8275 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
8276 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8277 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
8278 ; SSE-NEXT: movdqa %xmm2, %xmm4
8279 ; SSE-NEXT: movdqa %xmm10, %xmm2
8280 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
8281 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8282 ; SSE-NEXT: psrlq $48, %xmm2
8283 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8284 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8285 ; SSE-NEXT: psrlq $16, %xmm2
8286 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8287 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8288 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8289 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8290 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8291 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8292 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8293 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8294 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8295 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8296 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8297 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8298 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8299 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8300 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
8301 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
8302 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8303 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8304 ; SSE-NEXT: movdqa %xmm13, %xmm1
8305 ; SSE-NEXT: psrld $16, %xmm1
8306 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8307 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8308 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8309 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
8310 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
8311 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8312 ; SSE-NEXT: psrlq $48, %xmm2
8313 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8314 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8315 ; SSE-NEXT: psrlq $16, %xmm2
8316 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
8317 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8318 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8319 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
8320 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8321 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
8322 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8323 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8324 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8325 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8326 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8327 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8328 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8329 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8330 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8331 ; SSE-NEXT: movdqa %xmm0, %xmm1
8332 ; SSE-NEXT: psrld $16, %xmm1
8333 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8334 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8335 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8336 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
8337 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8338 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8339 ; SSE-NEXT: psrlq $48, %xmm2
8340 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8341 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8342 ; SSE-NEXT: psrlq $16, %xmm2
8343 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8344 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8345 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8346 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8347 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8348 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8349 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8350 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8351 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8352 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8353 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8354 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8355 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8356 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8357 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8358 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8359 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8360 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8361 ; SSE-NEXT: movdqa %xmm0, %xmm1
8362 ; SSE-NEXT: psrld $16, %xmm1
8363 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8364 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8365 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8366 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
8367 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8368 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8369 ; SSE-NEXT: psrlq $48, %xmm2
8370 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8371 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8372 ; SSE-NEXT: psrlq $16, %xmm2
8373 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8374 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8375 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8376 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8377 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3]
8378 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8379 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3]
8380 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
8381 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8382 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8383 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8384 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8385 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8386 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8387 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8388 ; SSE-NEXT: movdqa %xmm2, %xmm1
8389 ; SSE-NEXT: psrld $16, %xmm1
8390 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8391 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8392 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8393 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
8394 ; SSE-NEXT: movdqa %xmm2, %xmm4
8395 ; SSE-NEXT: movdqa %xmm0, %xmm2
8396 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
8397 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8398 ; SSE-NEXT: psrlq $48, %xmm2
8399 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8400 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8401 ; SSE-NEXT: psrlq $16, %xmm2
8402 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8403 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8404 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8405 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8406 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8407 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8408 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8409 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
8410 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8411 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8412 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8413 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8414 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8415 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8416 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8417 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8418 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8419 ; SSE-NEXT: movdqa %xmm12, %xmm13
8420 ; SSE-NEXT: psrld $16, %xmm13
8421 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8422 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
8423 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
8424 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
8425 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8426 ; SSE-NEXT: psrlq $48, %xmm2
8427 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8428 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8429 ; SSE-NEXT: psrlq $16, %xmm2
8430 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8431 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
8432 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8433 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8434 ; SSE-NEXT: pshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload
8435 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8436 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8437 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8438 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8439 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8440 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8441 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8442 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8443 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8444 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8445 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8446 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8447 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8448 ; SSE-NEXT: movdqa %xmm0, %xmm12
8449 ; SSE-NEXT: psrld $16, %xmm12
8450 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8451 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
8452 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
8453 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8454 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8455 ; SSE-NEXT: psrlq $48, %xmm2
8456 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8457 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8458 ; SSE-NEXT: movdqa %xmm9, %xmm2
8459 ; SSE-NEXT: psrlq $16, %xmm2
8460 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8461 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
8462 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8463 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8464 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8465 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8466 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8467 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8468 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8469 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8470 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8471 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8472 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8473 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8474 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8475 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8476 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8477 ; SSE-NEXT: movdqa %xmm2, %xmm11
8478 ; SSE-NEXT: psrld $16, %xmm11
8479 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8480 ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
8481 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
8482 ; SSE-NEXT: movdqa %xmm2, %xmm3
8483 ; SSE-NEXT: movdqa %xmm0, %xmm2
8484 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8485 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8486 ; SSE-NEXT: psrlq $48, %xmm2
8487 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8488 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8489 ; SSE-NEXT: movdqa %xmm4, %xmm2
8490 ; SSE-NEXT: psrlq $16, %xmm2
8491 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8492 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
8493 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8494 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8495 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8496 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
8497 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8498 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8499 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8500 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8501 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8502 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
8503 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
8504 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8505 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
8506 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
8507 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8508 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8509 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8510 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8511 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8512 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8513 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8514 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8515 ; SSE-NEXT: movdqa %xmm7, %xmm1
8516 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8517 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8518 ; SSE-NEXT: pand %xmm7, %xmm2
8519 ; SSE-NEXT: por %xmm1, %xmm2
8520 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8521 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8522 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8523 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8524 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8525 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8526 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8527 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8528 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8529 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8530 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8531 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8532 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8533 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8534 ; SSE-NEXT: movdqa %xmm7, %xmm1
8535 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8536 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8537 ; SSE-NEXT: pand %xmm7, %xmm2
8538 ; SSE-NEXT: por %xmm1, %xmm2
8539 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8540 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8541 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8542 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8543 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8544 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8545 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8546 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8547 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8548 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8549 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8550 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8551 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8552 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8553 ; SSE-NEXT: movdqa %xmm7, %xmm1
8554 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8555 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8556 ; SSE-NEXT: pand %xmm7, %xmm2
8557 ; SSE-NEXT: por %xmm1, %xmm2
8558 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8559 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8560 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8561 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8562 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8563 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8564 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8565 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8566 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8567 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
8568 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8569 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8570 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8571 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8572 ; SSE-NEXT: movdqa %xmm7, %xmm1
8573 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8574 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8575 ; SSE-NEXT: pand %xmm7, %xmm2
8576 ; SSE-NEXT: por %xmm1, %xmm2
8577 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8578 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8579 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8580 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8581 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8582 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
8583 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8584 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8585 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
8586 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8587 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8588 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
8589 ; SSE-NEXT: movdqa %xmm7, %xmm1
8590 ; SSE-NEXT: pandn (%rsp), %xmm1 # 16-byte Folded Reload
8591 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8592 ; SSE-NEXT: pand %xmm7, %xmm2
8593 ; SSE-NEXT: por %xmm1, %xmm2
8594 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8595 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8596 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8597 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8598 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8599 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2]
8600 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8601 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7]
8602 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8603 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
8604 ; SSE-NEXT: movdqa %xmm7, %xmm1
8605 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8606 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8607 ; SSE-NEXT: pand %xmm7, %xmm2
8608 ; SSE-NEXT: por %xmm1, %xmm2
8609 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8610 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8611 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8612 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8613 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8614 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2]
8615 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3]
8616 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7]
8617 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8618 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
8619 ; SSE-NEXT: movdqa %xmm7, %xmm1
8620 ; SSE-NEXT: pandn %xmm5, %xmm1
8621 ; SSE-NEXT: pand %xmm7, %xmm3
8622 ; SSE-NEXT: por %xmm1, %xmm3
8623 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3]
8624 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
8625 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8626 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8627 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
8628 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8629 ; SSE-NEXT: movdqa %xmm3, %xmm1
8630 ; SSE-NEXT: pand %xmm7, %xmm1
8631 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8632 ; SSE-NEXT: pandn %xmm5, %xmm7
8633 ; SSE-NEXT: por %xmm1, %xmm7
8634 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8635 ; SSE-NEXT: movdqa %xmm4, %xmm1
8636 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8637 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
8638 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8639 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
8640 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8641 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
8642 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3]
8643 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
8644 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8645 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
8646 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
8647 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2]
8648 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8649 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
8650 ; SSE-NEXT: movdqa %xmm4, %xmm1
8651 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8652 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8653 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8654 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8655 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8656 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8657 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8658 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7]
8659 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
8660 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3]
8661 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2]
8662 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8663 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
8664 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
8665 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8666 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8667 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8668 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8669 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8670 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8671 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8672 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8673 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8674 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8675 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8676 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8677 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8678 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8679 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8680 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8681 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
8682 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8683 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
8684 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
8685 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8686 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8687 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8688 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8689 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8690 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8691 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8692 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8693 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8694 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8695 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8696 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8697 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8698 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8699 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8700 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8701 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2]
8702 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8703 ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3]
8704 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
8705 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8706 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8707 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8708 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8709 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8710 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8711 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8712 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8713 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8714 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8715 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8716 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8717 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8718 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8719 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8720 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8721 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2]
8722 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8723 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8724 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
8725 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8726 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8727 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8728 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8729 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8730 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8731 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8732 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8733 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8734 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8735 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8736 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8737 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8738 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8739 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8740 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8741 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,2,2]
8742 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8743 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
8744 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
8745 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8746 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8747 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8748 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8749 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8750 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8751 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8752 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8753 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8754 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8755 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8756 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8757 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
8758 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8759 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8760 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
8761 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2]
8762 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8763 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
8764 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
8765 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8766 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8767 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8768 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8769 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8770 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
8771 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8772 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
8773 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
8774 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8775 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8776 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
8777 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8778 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
8779 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8780 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
8781 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
8782 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8783 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
8784 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
8785 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
8786 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8787 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8788 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8789 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
8790 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8791 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
8792 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
8793 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8794 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8795 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
8796 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8797 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8798 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8799 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
8800 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
8801 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7]
8802 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
8803 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
8804 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8805 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
8806 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8807 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
8808 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8809 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
8810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8811 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
8812 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8813 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
8814 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8815 ; SSE-NEXT: movaps %xmm1, (%rsi)
8816 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8817 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
8818 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8819 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
8820 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8821 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
8822 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8823 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
8824 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8825 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
8826 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8827 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
8828 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8829 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
8830 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8831 ; SSE-NEXT: movaps %xmm1, (%rdx)
8832 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8833 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
8834 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8835 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
8836 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8837 ; SSE-NEXT: movaps %xmm1, 96(%rcx)
8838 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8839 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
8840 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8841 ; SSE-NEXT: movaps %xmm1, 64(%rcx)
8842 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8843 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
8844 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8845 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
8846 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8847 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
8848 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8849 ; SSE-NEXT: movaps %xmm1, (%rcx)
8850 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8851 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
8852 ; SSE-NEXT: movdqa %xmm14, 112(%r8)
8853 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8854 ; SSE-NEXT: movaps %xmm1, 96(%r8)
8855 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8856 ; SSE-NEXT: movaps %xmm1, 80(%r8)
8857 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8858 ; SSE-NEXT: movaps %xmm1, 64(%r8)
8859 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8860 ; SSE-NEXT: movaps %xmm1, 48(%r8)
8861 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8862 ; SSE-NEXT: movaps %xmm1, 32(%r8)
8863 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8864 ; SSE-NEXT: movaps %xmm1, 16(%r8)
8865 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8866 ; SSE-NEXT: movaps %xmm1, (%r8)
8867 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8868 ; SSE-NEXT: movaps %xmm1, 112(%r9)
8869 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8870 ; SSE-NEXT: movaps %xmm1, 96(%r9)
8871 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8872 ; SSE-NEXT: movaps %xmm1, 80(%r9)
8873 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8874 ; SSE-NEXT: movaps %xmm1, 64(%r9)
8875 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8876 ; SSE-NEXT: movaps %xmm1, 48(%r9)
8877 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8878 ; SSE-NEXT: movaps %xmm1, 32(%r9)
8879 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8880 ; SSE-NEXT: movaps %xmm1, 16(%r9)
8881 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8882 ; SSE-NEXT: movaps %xmm1, (%r9)
8883 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8884 ; SSE-NEXT: movaps %xmm11, 112(%rax)
8885 ; SSE-NEXT: movaps %xmm12, 96(%rax)
8886 ; SSE-NEXT: movaps %xmm13, 80(%rax)
8887 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8888 ; SSE-NEXT: movaps %xmm1, 64(%rax)
8889 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8890 ; SSE-NEXT: movaps %xmm1, 48(%rax)
8891 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8892 ; SSE-NEXT: movaps %xmm1, 32(%rax)
8893 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8894 ; SSE-NEXT: movaps %xmm1, 16(%rax)
8895 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8896 ; SSE-NEXT: movaps %xmm1, (%rax)
8897 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8898 ; SSE-NEXT: movapd %xmm0, 112(%rax)
8899 ; SSE-NEXT: movapd %xmm4, 96(%rax)
8900 ; SSE-NEXT: movapd %xmm5, 80(%rax)
8901 ; SSE-NEXT: movapd %xmm6, 64(%rax)
8902 ; SSE-NEXT: movapd %xmm7, 48(%rax)
8903 ; SSE-NEXT: movapd %xmm8, 32(%rax)
8904 ; SSE-NEXT: movapd %xmm9, 16(%rax)
8905 ; SSE-NEXT: movapd %xmm10, (%rax)
8906 ; SSE-NEXT: addq $1352, %rsp # imm = 0x548
; SSE-NEXT: retq
;
8909 ; AVX1-ONLY-LABEL: load_i16_stride7_vf64:
8910 ; AVX1-ONLY: # %bb.0:
8911 ; AVX1-ONLY-NEXT: subq $1544, %rsp # imm = 0x608
8912 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0
8913 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8914 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0
8915 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1
8916 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8917 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
8918 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8919 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
8920 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8921 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1
8922 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2
8923 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8924 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8925 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
8926 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2
8927 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8928 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1
8929 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8930 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8931 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
8932 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
8933 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
8934 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7
8935 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0
8936 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8937 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
8938 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8939 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
8940 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8941 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
8942 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
8943 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
8944 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8945 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
8946 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
8947 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8948 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3
8949 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8950 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0
8951 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8952 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8953 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
8954 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
8955 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
8956 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3
8957 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8958 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6
8959 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm6[2],zero
8960 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8961 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
8962 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
8963 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2
8964 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3
8965 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
8966 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8967 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
8968 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8969 ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm1
8970 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8971 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
8972 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2
8973 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
8974 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
8975 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8976 ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2
8977 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8978 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2
8979 ; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm3
8980 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8981 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8982 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
8983 ; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2
8984 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8985 ; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3
8986 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8987 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8988 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
8989 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8990 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
8991 ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm2
8992 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8993 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
8994 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8995 ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm3
8996 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8997 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
8998 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8999 ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm5
9000 ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm15
9001 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm15[2],zero
9002 ; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9003 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9004 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
9005 ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4
9006 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9007 ; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3
9008 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9009 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
9010 ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4
9011 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9012 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
9013 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
9014 ; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm4
9015 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9016 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9017 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
9018 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9019 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
9020 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
9021 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
9022 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9023 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
9024 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9025 ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1
9026 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9027 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
9028 ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2
9029 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9030 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
9031 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9032 ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2
9033 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9034 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2
9035 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3
9036 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9037 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
9038 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
9039 ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3
9040 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9041 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2
9042 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9043 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
9044 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9045 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9046 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
9047 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10
9048 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3]
9049 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9050 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
9051 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12
9052 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3]
9053 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9054 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
9055 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9
9056 ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3
9057 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9058 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[2],xmm3[2],zero
9059 ; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9060 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
9061 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4
9062 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9063 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3
9064 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9065 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
9066 ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4
9067 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9068 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
9069 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
9070 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm11
9071 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3]
9072 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9073 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
9074 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9075 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
9076 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
9077 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
9078 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9079 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
9080 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9081 ; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1
9082 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9083 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1
9084 ; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2
9085 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9086 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
9087 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9088 ; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2
9089 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9090 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2
9091 ; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm3
9092 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9093 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
9094 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
9095 ; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm2
9096 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9097 ; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm13
9098 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
9099 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9100 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9101 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9102 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
9103 ; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm2
9104 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9105 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
9106 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
9107 ; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm3
9108 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9109 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
9110 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
9111 ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm14
9112 ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3
9113 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9114 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm14[2],xmm3[2],zero
9115 ; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9116 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
9117 ; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm4
9118 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9119 ; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3
9120 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9121 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
9122 ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4
9123 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9124 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
9125 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
9126 ; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm4
9127 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9128 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9129 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
9130 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9131 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2
9132 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm0
9133 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
9134 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9135 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
9136 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9137 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9138 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9139 ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9140 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
9141 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
9142 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9143 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9144 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6],mem[7]
9145 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
9146 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
9147 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9148 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9149 ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1
9150 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9151 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9152 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
9153 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1
9154 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9155 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9156 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9157 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9158 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
9159 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
9160 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9161 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2
9162 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
9163 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9164 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
9165 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
9166 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
9167 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4
9168 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9169 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload
9170 ; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7]
9171 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9172 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7]
9173 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
9174 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9175 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2
9176 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
9177 ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8
9178 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
9179 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
9180 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9181 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3
9182 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2
9183 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0
9184 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9185 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
9186 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9187 ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9188 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9189 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9190 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9191 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9192 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
9193 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9194 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
9195 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9196 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9197 ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3
9198 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9199 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
9200 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9201 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
9202 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1
9203 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3
9204 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9205 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9206 ; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
9207 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9208 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
9209 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9210 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9211 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4
9212 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9213 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9214 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9215 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9216 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
9217 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
9218 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9219 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm5
9220 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9221 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3
9222 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
9223 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
9224 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9225 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9226 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9227 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0
9228 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9229 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9230 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9231 ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9232 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9233 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9234 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9235 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9236 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7]
9237 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9238 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
9239 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9240 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9241 ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3
9242 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9243 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
9244 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9245 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload
9246 ; AVX1-ONLY-NEXT: # xmm3 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
9247 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
9248 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15
9249 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7]
9250 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9251 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
9252 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9253 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9254 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4
9255 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9256 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9257 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9258 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9259 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
9260 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
9261 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm5
9262 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9263 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3
9264 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4
9265 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
9266 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9267 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9268 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9269 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0
9270 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9271 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9272 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9273 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
9274 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9275 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9276 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9277 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9278 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5],xmm0[6],xmm10[7]
9279 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
9280 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
9281 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9282 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9283 ; AVX1-ONLY-NEXT: vpslld $16, %xmm8, %xmm3
9284 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
9285 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9286 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9287 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
9288 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3
9289 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9290 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9291 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm15[1],xmm11[2,3,4,5,6,7]
9292 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
9293 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
9294 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9295 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9296 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4
9297 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9298 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9299 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9300 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9301 ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
9302 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
9303 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9304 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5
9305 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9306 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9307 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
9308 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4
9309 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
9310 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9311 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9312 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9313 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
9314 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9315 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9316 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm2, %xmm2
9317 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9318 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
9319 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9320 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3]
9321 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
9322 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9323 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3],mem[4,5,6,7]
9324 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
9325 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9326 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9327 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9328 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
9329 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9330 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9331 ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2]
9332 ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9333 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7]
9334 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9335 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1]
9336 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm4[7]
9337 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9338 ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
9339 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4
9340 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9341 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3]
9342 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9343 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9344 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3]
9345 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1]
9346 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
9347 ; AVX1-ONLY-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3]
9348 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
9349 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
9350 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
9351 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9352 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
9353 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5
9354 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
9355 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9356 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3
9357 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4
9358 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3
9359 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9360 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9361 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm3
9362 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
9363 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
9364 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9365 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3]
9366 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
9367 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9368 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7]
9369 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
9370 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9371 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9372 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9373 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
9374 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
9375 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9376 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
9377 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
9378 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9379 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3]
9380 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm4[1],xmm5[1]
9381 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9382 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9383 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
9384 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,2,3]
9385 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
9386 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
9387 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9388 ; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2]
9389 ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
9390 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7]
9391 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9392 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1]
9393 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
9394 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9395 ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1]
9396 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
9397 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9398 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5
9399 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6
9400 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5
9401 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9402 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9403 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3
9404 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5
9405 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3
9406 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9407 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm1, %xmm3
9408 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
9409 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3]
9410 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9411 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3],xmm10[4,5,6,7]
9412 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
9413 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9414 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9415 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
9416 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
9417 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9418 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3]
9419 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
9420 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3]
9421 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
9422 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
9423 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9424 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3]
9425 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
9426 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4,5,6,7]
9427 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,2,2]
9428 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm7[6,7]
9429 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9430 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1]
9431 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7]
9432 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9433 ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1]
9434 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
9435 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9436 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5
9437 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7
9438 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5
9439 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9440 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9441 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3
9442 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm9, %ymm5
9443 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm0
9444 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9445 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9446 ; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm3
9447 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9448 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
9449 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9450 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,3,2,3]
9451 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9452 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9453 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7]
9454 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
9455 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9456 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9457 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9458 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
9459 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],xmm5[6,7]
9460 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9461 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3]
9462 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
9463 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9464 ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3]
9465 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1]
9466 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9467 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9468 ; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9469 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3]
9470 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
9471 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3,4,5,6,7]
9472 ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9473 ; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2]
9474 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9475 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm15[6,7]
9476 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9477 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1]
9478 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7]
9479 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9480 ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1]
9481 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
9482 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7
9483 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14
9484 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7
9485 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
9486 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm9, %ymm5
9487 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7
9488 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5
9489 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9490 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9491 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9492 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
9493 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
9494 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7]
9495 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9496 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
9497 ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7]
9498 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7]
9499 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,6,7]
9500 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3,4,5,6,7]
9501 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9502 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
9503 ; AVX1-ONLY-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
9504 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9505 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,1,2,3,4,5,6,7]
9506 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9507 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7]
9508 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
9509 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
9510 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9511 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9512 ; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7]
9513 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7]
9514 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
9515 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7]
9516 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9517 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9518 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9519 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
9520 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
9521 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
9522 ; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9523 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
9524 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15
9525 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9526 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12
9527 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12
9528 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2
9529 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm11, %ymm12
9530 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2
9531 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12
9532 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm9, %ymm12
9533 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2
9534 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2
9535 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9536 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9537 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9538 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
9539 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
9540 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7]
9541 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
9542 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload
9543 ; AVX1-ONLY-NEXT: # xmm12 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7]
9544 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
9545 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
9546 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7]
9547 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9548 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9549 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9550 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,3,4,5,6,7]
9551 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9552 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
9553 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
9554 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
9555 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9556 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload
9557 ; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3,4,5,6,7]
9558 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
9559 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
9560 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7]
9561 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9562 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9563 ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
9564 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
9565 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
9566 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9567 ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
9568 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14
9569 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9570 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm15
9571 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
9572 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9573 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4
9574 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14
9575 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4
9576 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
9577 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9578 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12
9579 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4
9580 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4
9581 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9582 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
9583 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
9584 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
9585 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload
9586 ; AVX1-ONLY-NEXT: # xmm12 = xmm13[0,1,2,3,4,5],mem[6],xmm13[7]
9587 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
9588 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
9589 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7]
9590 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9591 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
9592 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7]
9593 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9594 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
9595 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
9596 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9597 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9598 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
9599 ; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
9600 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
9601 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
9602 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7]
9603 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
9604 ; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9605 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
9606 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
9607 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
9608 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14
9609 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9610 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15
9611 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
9612 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
9613 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3
9614 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14
9615 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm14, %ymm3
9616 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
9617 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9618 ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12
9619 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3
9620 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm0
9621 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9622 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9623 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9624 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
9625 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
9626 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7]
9627 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9628 ; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
9629 ; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
9630 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
9631 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
9632 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm3[1,2],xmm12[3,4,5,6,7]
9633 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9634 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9635 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9636 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,2,3,4,5,6,7]
9637 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
9638 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
9639 ; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9640 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7]
9641 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
9642 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9643 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
9644 ; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
9645 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
9646 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
9647 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3,4,5,6,7]
9648 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9649 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
9650 ; AVX1-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9651 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
9652 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
9653 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9654 ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
9655 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5
9656 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9657 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14
9658 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5
9659 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6
9660 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm1
9661 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm6, %ymm1
9662 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5
9663 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5
9664 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1
9665 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm0
9666 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9667 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
9668 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7]
9669 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9670 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3]
9671 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9672 ; AVX1-ONLY-NEXT: vpunpckldq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
9673 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
9674 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3,4,5,6,7]
9675 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9676 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9677 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
9678 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9679 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1
9680 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9681 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3]
9682 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
9683 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm5
9684 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9685 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
9686 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
9687 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9688 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,3]
9689 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
9690 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9691 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9692 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
9693 ; AVX1-ONLY-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9694 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7]
9695 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6
9696 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9697 ; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3]
9698 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6
9699 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
9700 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9701 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2
9702 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5
9703 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0
9704 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9705 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9706 ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1]
9707 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9708 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7]
9709 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9710 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,3,2,3]
9711 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9712 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9713 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
9714 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7]
9715 ; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9716 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,1]
9717 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
9718 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7]
9719 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9720 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5
9721 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9722 ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3]
9723 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
9724 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9725 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6
9726 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9727 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
9728 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
9729 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9730 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,1,0,3]
9731 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
9732 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9733 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
9734 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9735 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
9736 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6
9737 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9738 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3]
9739 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
9740 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
9741 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9742 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9743 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9744 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5
9745 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2
9746 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9747 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9748 ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1]
9749 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9750 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
9751 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9752 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3]
9753 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
9754 ; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9755 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
9756 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7]
9757 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
9758 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
9759 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
9760 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9761 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4
9762 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9763 ; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3]
9764 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9765 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9766 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm5
9767 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9768 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
9769 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
9770 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9771 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3]
9772 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
9773 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9774 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
9775 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9776 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9777 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
9778 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9779 ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3]
9780 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
9781 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
9782 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9783 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9784 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4
9785 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2
9786 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9787 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9788 ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1]
9789 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9790 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
9791 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9792 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
9793 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
9794 ; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9795 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
9796 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7]
9797 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
9798 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
9799 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9800 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9801 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3
9802 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9803 ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3]
9804 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9805 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9806 ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm4
9807 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9808 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
9809 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
9810 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9811 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3]
9812 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
9813 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9814 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9815 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9816 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
9817 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1
9818 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9819 ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3]
9820 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
9821 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
9822 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9823 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9824 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1
9825 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
9826 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9827 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1
9828 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9829 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9830 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9831 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
9832 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
9833 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
9834 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
9835 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9836 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9837 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7]
9838 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
9839 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
9840 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9841 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9842 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2
9843 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9844 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9845 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9846 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
9847 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
9848 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9849 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
9850 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
9851 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
9852 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7]
9853 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
9854 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
9855 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
9856 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9857 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9858 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
9859 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9860 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1
9861 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2
9862 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
9863 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9864 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9865 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1
9866 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9867 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9868 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9869 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
9870 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9871 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9872 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
9873 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
9874 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9875 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9876 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7]
9877 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
9878 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
9879 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
9880 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9881 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2
9882 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9883 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9884 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9885 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9886 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
9887 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
9888 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9889 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
9890 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9891 ; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9892 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7]
9893 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
9894 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7]
9895 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9896 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9897 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
9898 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
9899 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9900 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9901 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9902 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
9903 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9904 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1
9905 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2
9906 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1
9907 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9908 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9909 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2
9910 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9911 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9912 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
9913 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9914 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9915 ; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9916 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3
9917 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
9918 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9919 ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9920 ; AVX1-ONLY-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7]
9921 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
9922 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
9923 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
9924 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9925 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm3
9926 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9927 ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9928 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9929 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9930 ; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9931 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
9932 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
9933 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
9934 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9935 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9936 ; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm1[1],mem[2,3,4,5,6,7]
9937 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
9938 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7]
9939 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9940 ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
9941 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
9942 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
9943 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9944 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9945 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9946 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
9947 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9948 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
9949 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3
9950 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm1
9951 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9952 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9953 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
9954 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3
9955 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9956 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4
9957 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9958 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9959 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9960 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
9961 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9962 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9963 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7]
9964 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
9965 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6]
9966 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
9967 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9968 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4
9969 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9970 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
9971 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9972 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9973 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9974 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
9975 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
9976 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
9977 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9978 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9979 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm13[1],xmm10[2,3,4,5,6,7]
9980 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
9981 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,7,7]
9982 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
9983 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9984 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
9985 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,2]
9986 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9987 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9988 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
9989 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
9990 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9991 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3
9992 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4
9993 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3
9994 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9995 ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload
9996 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
9997 ; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero
9998 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9999 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
10000 ; AVX1-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
10001 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
10002 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10003 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
10004 ; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7]
10005 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
10006 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
10007 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
10008 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10009 ; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1]
10010 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10011 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10012 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
10013 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10014 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
10015 ; AVX1-ONLY-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
10016 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
10017 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
10018 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
10019 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10020 ; AVX1-ONLY-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
10021 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10022 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10023 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
10024 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3]
10025 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
10026 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
10027 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
10028 ; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3]
10029 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
10030 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
10031 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
10032 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
10033 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4
10034 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5
10035 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
10036 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload
10037 ; AVX1-ONLY-NEXT: # xmm5 = zero,xmm15[1],mem[0],zero
10038 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
10039 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
10040 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7]
10041 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
10042 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
10043 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7]
10044 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
10045 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10046 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10047 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10048 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
10049 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
10050 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
10051 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
10052 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
10053 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
10054 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
10055 ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3]
10056 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
10057 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
10058 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3]
10059 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
10060 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7]
10061 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
10062 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5
10063 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm6, %ymm6
10064 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5
10065 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10066 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
10067 ; AVX1-ONLY-NEXT: # xmm6 = zero,xmm0[1],mem[0],zero
10068 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10069 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
10070 ; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3]
10071 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
10072 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10073 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
10074 ; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
10075 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
10076 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
10077 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
10078 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
10079 ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1]
10080 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10081 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10082 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
10083 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10084 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
10085 ; AVX1-ONLY-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10086 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
10087 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
10088 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
10089 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
10090 ; AVX1-ONLY-NEXT: # xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
10091 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10092 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
10093 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10094 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3]
10095 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
10096 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7]
10097 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10098 ; AVX1-ONLY-NEXT: # xmm9 = mem[3,3,3,3]
10099 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
10100 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7]
10101 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
10102 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6
10103 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7
10104 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6
10105 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10106 ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
10107 ; AVX1-ONLY-NEXT: # xmm7 = zero,xmm0[1],mem[0],zero
10108 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10109 ; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
10110 ; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
10111 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
10112 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10113 ; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
10114 ; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
10115 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
10116 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
10117 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
10118 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
10119 ; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1]
10120 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10121 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10122 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
10123 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10124 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
10125 ; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10126 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
10127 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
10128 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
10129 ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10130 ; AVX1-ONLY-NEXT: # xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
10131 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10132 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
10133 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
10134 ; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,3]
10135 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
10136 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7]
10137 ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
10138 ; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3]
10139 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
10140 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7]
10141 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
10142 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7
10143 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm0
10144 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0
10145 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10146 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi)
10147 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10148 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi)
10149 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10150 ; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi)
10151 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10152 ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi)
10153 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10154 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx)
10155 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10156 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx)
10157 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10158 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx)
10159 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10160 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx)
10161 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10162 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx)
10163 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10164 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx)
10165 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10166 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx)
10167 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10168 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx)
10169 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10170 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8)
10171 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10172 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
10173 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10174 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8)
10175 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10176 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8)
10177 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10178 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9)
10179 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10180 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9)
10181 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10182 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9)
10183 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10184 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9)
10185 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
10186 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10187 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax)
10188 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10189 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax)
10190 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10191 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax)
10192 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10193 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax)
10194 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
10195 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax)
10196 ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax)
10197 ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax)
10198 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax)
10199 ; AVX1-ONLY-NEXT: addq $1544, %rsp # imm = 0x608
10200 ; AVX1-ONLY-NEXT: vzeroupper
10201 ; AVX1-ONLY-NEXT: retq
10202 ;
10203 ; AVX2-SLOW-LABEL: load_i16_stride7_vf64:
10204 ; AVX2-SLOW: # %bb.0:
10205 ; AVX2-SLOW-NEXT: subq $1448, %rsp # imm = 0x5A8
10206 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13
10207 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm14
10208 ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15
10209 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12
10210 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm9
10211 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10
10212 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3
10213 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10214 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
10215 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10216 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0
10217 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10218 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
10219 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10220 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
10221 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
10222 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
10223 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
10224 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
10225 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10226 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
10227 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
10228 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4
10229 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
10230 ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
10231 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10232 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7]
10233 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10234 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10235 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10236 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
10237 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10238 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7]
10239 ; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10240 ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10241 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
10242 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10243 ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0
10244 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8
10245 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
10246 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
10247 ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10248 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10249 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10250 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
10251 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10252 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7
10253 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1
10254 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
10255 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10256 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10257 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
10258 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10259 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0
10260 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10261 ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0
10262 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10263 ; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm8
10264 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7]
10265 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10266 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
10267 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10268 ; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm11
10269 ; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm5
10270 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7]
10271 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm0
10272 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
10273 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10274 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
10275 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
10276 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10277 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7]
10278 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
10279 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15]
10280 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
10281 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10282 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
10283 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
10284 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm5
10285 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
10286 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
10287 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
10288 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9
10289 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10290 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7]
10291 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
10292 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
10293 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
10294 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10295 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
10296 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
10297 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
10298 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4
10299 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10300 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15
10301 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
10302 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm14
10303 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10304 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
10305 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
10306 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10307 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7]
10308 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13
10309 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10310 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10311 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
10312 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
10313 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
10314 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7
10315 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0
10316 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10317 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10318 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10319 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7]
10320 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
10321 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
10322 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
10323 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10324 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10325 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
10326 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10327 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
10328 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2
10329 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0
10330 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10331 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
10332 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7]
10333 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
10334 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7]
10335 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm10
10336 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
10337 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
10338 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
10339 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm5
10340 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
10341 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10342 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0
10343 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10344 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10345 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10346 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7]
10347 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10348 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
10349 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10350 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10351 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
10352 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1]
10353 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
10354 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10355 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10356 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
10357 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
10358 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10359 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
10360 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10361 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
10362 ; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10363 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
10364 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1]
10365 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
10366 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10367 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10368 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
10369 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10370 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10371 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
10372 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
10373 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10374 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
10375 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10376 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10377 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10378 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
10379 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
10380 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
10381 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10382 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5
10383 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
10384 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10385 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
10386 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
10387 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
10388 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10389 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10390 ; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
10391 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
10392 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
10393 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4
10394 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10395 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10396 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
10397 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10398 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7]
10399 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10400 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
10401 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10402 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10403 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
10404 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
10405 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4
10406 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
10407 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10408 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
10409 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10410 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
10411 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10412 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10413 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7]
10414 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
10415 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4
10416 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
10417 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10418 ; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
10419 ; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
10420 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
10421 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
10422 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10423 ; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
10424 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7]
10425 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
10426 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10427 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
10428 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
10429 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0
10430 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10431 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0
10432 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2
10433 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
10434 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10
10435 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm12
10436 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7]
10437 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
10438 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
10439 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
10440 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
10441 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10442 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0
10443 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10444 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2]
10445 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5]
10446 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10447 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
10448 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
10449 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7]
10450 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10451 ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13
10452 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11
10453 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
10454 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
10455 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
10456 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
10457 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
10458 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
10459 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10460 ; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1
10461 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10462 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2]
10463 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5]
10464 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10465 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
10466 ; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
10467 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
10468 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10469 ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5
10470 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6
10471 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
10472 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10473 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10474 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7]
10475 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
10476 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
10477 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
10478 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
10479 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
10480 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14
10481 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2]
10482 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5]
10483 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10484 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7]
10485 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
10486 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7]
10487 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10488 ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm7
10489 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1
10490 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7]
10491 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9
10492 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7]
10493 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
10494 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10495 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
10496 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
10497 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10498 ; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm2
10499 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2]
10500 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5]
10501 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10502 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
10503 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10504 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
10505 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10506 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7]
10507 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15
10508 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7]
10509 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
10510 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10511 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10512 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7]
10513 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10514 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
10515 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10516 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
10517 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10518 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10519 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
10520 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4
10521 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
10522 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10523 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10524 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7]
10525 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10526 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
10527 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10528 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
10529 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10530 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10531 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7]
10532 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
10533 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
10534 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10535 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10536 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
10537 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10538 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10539 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10540 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10541 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10542 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10543 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
10544 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10545 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
10546 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
10547 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10548 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7]
10549 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
10550 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10551 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10552 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10553 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10554 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10555 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
10556 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10557 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
10558 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
10559 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
10560 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10561 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10562 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2]
10563 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
10564 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10565 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
10566 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10567 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
10568 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
10569 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10570 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm6
10571 ; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm4
10572 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
10573 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10574 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
10575 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
10576 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10577 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10578 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2]
10579 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
10580 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10581 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
10582 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10583 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
10584 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
10585 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10586 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7]
10587 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5
10588 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10589 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
10590 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10591 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
10592 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
10593 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10594 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2]
10595 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
10596 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10597 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
10598 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10599 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
10600 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
10601 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10602 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10603 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10604 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
10605 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
10606 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
10607 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0
10608 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10609 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2]
10610 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
10611 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
10612 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10613 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10614 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10615 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10616 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10617 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7]
10618 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10619 ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10620 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10621 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
10622 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10623 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10624 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10625 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3]
10626 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10627 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
10628 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10629 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10630 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10631 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10632 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10633 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10634 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
10635 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8
10636 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10637 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10638 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10639 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
10640 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10641 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10642 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10643 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3]
10644 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4]
10645 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10646 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10647 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10648 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
10649 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10650 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10651 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
10652 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10653 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
10654 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10655 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10656 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10657 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3]
10658 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
10659 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10660 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
10661 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10662 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
10663 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10664 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10665 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7]
10666 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
10667 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
10668 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
10669 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
10670 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
10671 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3]
10672 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
10673 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
10674 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
10675 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10676 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
10677 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
10678 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10679 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10680 ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10681 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
10682 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
10683 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
10684 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
10685 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
10686 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
10687 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10688 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
10689 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
10690 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27>
10691 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm5
10692 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
10693 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15]
10694 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
10695 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
10696 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
10697 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7]
10698 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
10699 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
10700 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10701 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5]
10702 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10703 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
10704 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
10705 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10706 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10707 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
10708 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10709 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
10710 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
10711 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
10712 ; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10713 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5]
10714 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10715 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
10716 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10717 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
10718 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
10719 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10720 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
10721 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
10722 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
10723 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
10724 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10725 ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10726 ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7]
10727 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm7
10728 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
10729 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
10730 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
10731 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
10732 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
10733 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10734 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10735 ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
10736 ; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
10737 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10738 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
10739 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
10740 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
10741 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
10742 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10743 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10744 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
10745 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6
10746 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
10747 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
10748 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
10749 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10750 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8
10751 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
10752 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
10753 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
10754 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
10755 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
10756 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5]
10757 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10758 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
10759 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
10760 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
10761 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10762 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10763 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10764 ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10765 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4
10766 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
10767 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
10768 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
10769 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
10770 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10771 ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
10772 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7]
10773 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
10774 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
10775 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15]
10776 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
10777 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10778 ; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload
10779 ; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7]
10780 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
10781 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
10782 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10783 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
10784 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
10785 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
10786 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
10787 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
10788 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10789 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10790 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
10791 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
10792 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
10793 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10794 ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10795 ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
10796 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
10797 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
10798 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
10799 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10800 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10801 ; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm13
10802 ; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm14
10803 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7]
10804 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
10805 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
10806 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10807 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
10808 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10809 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10810 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
10811 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
10812 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
10813 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
10814 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
10815 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
10816 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
10817 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29>
10818 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
10819 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
10820 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
10821 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10822 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10823 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
10824 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
10825 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
10826 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
10827 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4
10828 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7]
10829 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10830 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10831 ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm4
10832 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10833 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5
10834 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10835 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7]
10836 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
10837 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6]
10838 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
10839 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15]
10840 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
10841 ; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
10842 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10843 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
10844 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
10845 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
10846 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
10847 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
10848 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
10849 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
10850 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
10851 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10852 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10853 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10854 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
10855 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
10856 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
10857 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
10858 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
10859 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
10860 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
10861 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
10862 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
10863 ; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1
10864 ; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0
10865 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
10866 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
10867 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
10868 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10869 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
10870 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10871 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
10872 ; AVX2-SLOW-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
10873 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm15
10874 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
10875 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
10876 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
10877 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
10878 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
10879 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5
10880 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
10881 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
10882 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
10883 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10884 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10885 ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
10886 ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
10887 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
10888 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7]
10889 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3
10890 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
10891 ; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm15
10892 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3
10893 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7]
10894 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
10895 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
10896 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10897 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
10898 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10899 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
10900 ; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
10901 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
10902 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15]
10903 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2
10904 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10905 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10906 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
10907 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
10908 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
10909 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
10910 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
10911 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
10912 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
10913 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15]
10914 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
10915 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
10916 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10917 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10918 ; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10919 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7]
10920 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
10921 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15]
10922 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10923 ; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10924 ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
10925 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5
10926 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7]
10927 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
10928 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
10929 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
10930 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10931 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
10932 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
10933 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10934 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
10935 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10936 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
10937 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7]
10938 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7]
10939 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
10940 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10941 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
10942 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
10943 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31>
10944 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6
10945 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15]
10946 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
10947 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
10948 ; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
10949 ; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7]
10950 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
10951 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15]
10952 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload
10953 ; AVX2-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7]
10954 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
10955 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7]
10956 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
10957 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1
10958 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10959 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
10960 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
10961 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10962 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
10963 ; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
10964 ; AVX2-SLOW-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7]
10965 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7]
10966 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
10967 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
10968 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
10969 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
10970 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7
10971 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
10972 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
10973 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10974 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10975 ; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
10976 ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7]
10977 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
10978 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15]
10979 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10980 ; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
10981 ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
10982 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9
10983 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7]
10984 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10985 ; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10986 ; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
10987 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7
10988 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10989 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
10990 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
10991 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
10992 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15]
10993 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10994 ; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
10995 ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
10996 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7]
10997 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
10998 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
10999 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
11000 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
11001 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
11002 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
11003 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
11004 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
11005 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11006 ; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11007 ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
11008 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7
11009 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7]
11010 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
11011 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
11012 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
11013 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
11014 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
11015 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
11016 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15]
11017 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11018 ; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11019 ; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7]
11020 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
11021 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
11022 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
11023 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
11024 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
11025 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4
11026 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
11027 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
11028 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
11029 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
11030 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
11031 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
11032 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11033 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi)
11034 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11035 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi)
11036 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11037 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi)
11038 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11039 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi)
11040 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11041 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx)
11042 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11043 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx)
11044 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11045 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx)
11046 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11047 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx)
11048 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11049 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx)
11050 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11051 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx)
11052 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11053 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx)
11054 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11055 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx)
11056 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11057 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r8)
11058 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11059 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8)
11060 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11061 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r8)
11062 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11063 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8)
11064 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11065 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9)
11066 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11067 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r9)
11068 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11069 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9)
11070 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11071 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9)
11072 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
11073 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11074 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rax)
11075 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11076 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax)
11077 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11078 ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax)
11079 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11080 ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax)
11081 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
11082 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rax)
11083 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax)
11084 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax)
11085 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax)
11086 ; AVX2-SLOW-NEXT: addq $1448, %rsp # imm = 0x5A8
11087 ; AVX2-SLOW-NEXT: vzeroupper
11088 ; AVX2-SLOW-NEXT: retq
11090 ; AVX2-FAST-LABEL: load_i16_stride7_vf64:
11091 ; AVX2-FAST: # %bb.0:
11092 ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608
11093 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6
11094 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7
11095 ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm14
11096 ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm15
11097 ; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11098 ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm12
11099 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11100 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13
11101 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11102 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2
11103 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9
11104 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
11105 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0
11106 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11107 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
11108 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11
11109 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
11110 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
11111 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
11112 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7]
11113 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
11114 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
11115 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
11116 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
11117 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2
11118 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
11119 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0
11120 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11121 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
11122 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
11123 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
11124 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
11125 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
11126 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11127 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
11128 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
11129 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0
11130 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11131 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
11132 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15
11133 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11134 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm13
11135 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11136 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
11137 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
11138 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3
11139 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1
11140 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12
11141 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7]
11142 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11143 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11144 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
11145 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6
11146 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0
11147 ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
11148 ; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2
11149 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11150 ; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm0
11151 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11152 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
11153 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
11154 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
11155 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5
11156 ; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2
11157 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm0
11158 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11159 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
11160 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3
11161 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11162 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
11163 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4
11164 ; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm0
11165 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2
11166 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11167 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm2
11168 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11169 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11170 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
11171 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11172 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
11173 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11174 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11175 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7]
11176 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,5,1,u,4,u,u,u>
11177 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6
11178 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
11179 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8
11180 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
11181 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11182 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
11183 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11184 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11185 ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
11186 ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7]
11187 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11188 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
11189 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11190 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
11191 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
11192 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11193 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11194 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
11195 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14
11196 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11197 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7]
11198 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11199 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
11200 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11201 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7]
11202 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11203 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11204 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
11205 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11206 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11207 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11208 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7]
11209 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11210 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
11211 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6
11212 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11213 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
11214 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4
11215 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
11216 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3
11217 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11218 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7]
11219 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11220 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
11221 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
11222 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,1,u,5,u,u,u>
11223 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6
11224 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
11225 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8
11226 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
11227 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11228 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2
11229 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11230 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11231 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11232 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7]
11233 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11234 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
11235 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11236 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11237 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11238 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
11239 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11240 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11241 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1
11242 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11243 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm1
11244 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7]
11245 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11246 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
11247 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11248 ; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm11
11249 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11250 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
11251 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8
11252 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8
11253 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7
11254 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11255 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11256 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11257 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7]
11258 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
11259 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
11260 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6
11261 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11262 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11263 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
11264 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4
11265 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
11266 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
11267 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
11268 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11269 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11270 ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11271 ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
11272 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11273 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
11274 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11275 ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
11276 ; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7]
11277 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3]
11278 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
11279 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm7
11280 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
11281 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6
11282 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
11283 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11284 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7]
11285 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
11286 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
11287 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6
11288 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
11289 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
11290 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7
11291 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3
11292 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11293 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7]
11294 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
11295 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
11296 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6
11297 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
11298 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
11299 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7
11300 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2
11301 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11302 ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
11303 ; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7]
11304 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
11305 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
11306 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5
11307 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7]
11308 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3]
11309 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4
11310 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0
11311 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11312 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10
11313 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1
11314 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7]
11315 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11
11316 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5]
11317 ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1]
11318 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4
11319 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31]
11320 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6
11321 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0
11322 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11323 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
11324 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
11325 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm7
11326 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14
11327 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
11328 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
11329 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7]
11330 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11331 ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12
11332 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm7
11333 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7]
11334 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0
11335 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm8
11336 ; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm0
11337 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11338 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
11339 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2
11340 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7]
11341 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
11342 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7]
11343 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11344 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0
11345 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6
11346 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
11347 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15
11348 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11349 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2
11350 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm8
11351 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0
11352 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11353 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
11354 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm9
11355 ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13
11356 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
11357 ; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload
11358 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7]
11359 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11360 ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm0
11361 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm8
11362 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7]
11363 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9
11364 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm5
11365 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5
11366 ; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm0
11367 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11368 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
11369 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14
11370 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7]
11371 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
11372 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7]
11373 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11374 ; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13
11375 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7]
11376 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14
11377 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7]
11378 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
11379 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5
11380 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
11381 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
11382 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4
11383 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
11384 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11385 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
11386 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
11387 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11388 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7]
11389 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11390 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
11391 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4
11392 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
11393 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
11394 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
11395 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11396 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15]
11397 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
11398 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11399 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7]
11400 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
11401 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
11402 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1
11403 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11404 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
11405 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11406 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11407 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
11408 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11409 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11410 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15
11411 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
11412 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
11413 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
11414 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1
11415 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0
11416 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11417 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
11418 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11419 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
11420 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11421 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11422 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
11423 ; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14
11424 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11425 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11426 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
11427 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
11428 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
11429 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11430 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11431 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
11432 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11433 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3
11434 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
11435 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11436 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11437 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
11438 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11439 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11440 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm5
11441 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10
11442 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
11443 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
11444 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
11445 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11446 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11447 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11448 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm3
11449 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
11450 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11451 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11452 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
11453 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11454 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11455 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
11456 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11457 ; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
11458 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
11459 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
11460 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11461 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11462 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11463 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm3
11464 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
11465 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11466 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11467 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
11468 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11469 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11470 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11471 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11472 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
11473 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
11474 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
11475 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11476 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11477 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11478 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm1
11479 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
11480 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
11481 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11482 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
11483 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11484 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11485 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
11486 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
11487 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
11488 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
11489 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
11490 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
11491 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3]
11492 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11493 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
11494 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2
11495 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
11496 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11497 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
11498 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11499 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11500 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7]
11501 ; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14
11502 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11503 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm0
11504 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11505 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
11506 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
11507 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11508 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11509 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,3]
11510 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm5
11511 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
11512 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11513 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
11514 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
11515 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11516 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7]
11517 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5
11518 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
11519 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11520 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11521 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3]
11522 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11523 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5
11524 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
11525 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11526 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
11527 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
11528 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11529 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7]
11530 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5
11531 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
11532 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11533 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11534 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3]
11535 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11536 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
11537 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
11538 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11539 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
11540 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
11541 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11542 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11543 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11544 ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
11545 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
11546 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3
11547 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10
11548 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
11549 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
11550 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
11551 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
11552 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
11553 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7]
11554 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
11555 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4
11556 ; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm1
11557 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
11558 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
11559 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2
11560 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7]
11561 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11562 ; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
11563 ; AVX2-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
11564 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,2,6,u,u,u>
11565 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5
11566 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
11567 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5
11568 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
11569 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
11570 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
11571 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11572 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11573 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11574 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7]
11575 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4
11576 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3
11577 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
11578 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
11579 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11580 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11581 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
11582 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
11583 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
11584 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4
11585 ; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10
11586 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
11587 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11588 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0
11589 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
11590 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11591 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11592 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
11593 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4
11594 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4
11595 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
11596 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
11597 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11598 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11599 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11600 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11601 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11602 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
11603 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3
11604 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
11605 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
11606 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
11607 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11608 ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
11609 ; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
11610 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
11611 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
11612 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3
11613 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
11614 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11615 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4
11616 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
11617 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11618 ; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
11619 ; AVX2-FAST-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7]
11620 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4
11621 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4
11622 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
11623 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
11624 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
11625 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
11626 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11627 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11628 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11629 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
11630 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3
11631 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
11632 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
11633 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
11634 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11635 ; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11636 ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7]
11637 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2
11638 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11639 ; AVX2-FAST-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
11640 ; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
11641 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
11642 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
11643 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
11644 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11645 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3
11646 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11647 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
11648 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2
11649 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
11650 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
11651 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11652 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11653 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7]
11654 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
11655 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
11656 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
11657 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4]
11658 ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
11659 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
11660 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
11661 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
11662 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
11663 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
11664 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
11665 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
11666 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
11667 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
11668 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
11669 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u>
11670 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm0
11671 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5
11672 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
11673 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15]
11674 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
11675 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
11676 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11677 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
11678 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
11679 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0
11680 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11681 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
11682 ; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
11683 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12
11684 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
11685 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
11686 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
11687 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11688 ; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload
11689 ; AVX2-FAST-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
11690 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
11691 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13
11692 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6
11693 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
11694 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12
11695 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
11696 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11697 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11698 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7]
11699 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4
11700 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13
11701 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29>
11702 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13
11703 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
11704 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11705 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
11706 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11707 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
11708 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0
11709 ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
11710 ; AVX2-FAST-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7]
11711 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12
11712 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
11713 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
11714 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11715 ; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
11716 ; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
11717 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
11718 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm13
11719 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12
11720 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
11721 ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
11722 ; AVX2-FAST-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7]
11723 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm13
11724 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2
11725 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13
11726 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
11727 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11728 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
11729 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11730 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11731 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm0
11732 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
11733 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11734 ; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
11735 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
11736 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3
11737 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3
11738 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
11739 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11740 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11741 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
11742 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2
11743 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1
11744 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11745 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11746 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
11747 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
11748 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
11749 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
11750 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
11751 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
11752 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
11753 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11754 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11755 ; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
11756 ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
11757 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
11758 ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
11759 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
11760 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
11761 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5
11762 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
11763 ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
11764 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
11765 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
11766 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15]
11767 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
11768 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11769 ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
11770 ; AVX2-FAST-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7]
11771 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10
11772 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
11773 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10
11774 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
11775 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
11776 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u>
11777 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6
11778 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
11779 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6
11780 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15]
11781 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
11782 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
11783 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
11784 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm7
11785 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7
11786 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6
11787 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
11788 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6
11789 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
11790 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7]
11791 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
11792 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
11793 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm9
11794 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
11795 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
11796 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7
11797 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7
11798 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
11799 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
11800 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
11801 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11802 ; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
11803 ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
11804 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
11805 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8
11806 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7
11807 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7
11808 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
11809 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11810 ; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
11811 ; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7]
11812 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11813 ; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
11814 ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7]
11815 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11
11816 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11
11817 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7]
11818 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
11819 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8
11820 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8
11821 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
11822 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
11823 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
11824 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11825 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
11826 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11827 ; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11828 ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
11829 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
11830 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2
11831 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
11832 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11833 ; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11834 ; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7]
11835 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2
11836 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2
11837 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11838 ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
11839 ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7]
11840 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
11841 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
11842 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
11843 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
11844 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
11845 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
11846 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
11847 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11848 ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi)
11849 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11850 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi)
11851 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11852 ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi)
11853 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11854 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi)
11855 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11856 ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx)
11857 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11858 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx)
11859 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11860 ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx)
11861 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11862 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx)
11863 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11864 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx)
11865 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11866 ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx)
11867 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11868 ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx)
11869 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11870 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx)
11871 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11872 ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8)
11873 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11874 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8)
11875 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11876 ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8)
11877 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11878 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8)
11879 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11880 ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9)
11881 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11882 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9)
11883 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11884 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9)
11885 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11886 ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9)
11887 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
11888 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11889 ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax)
11890 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11891 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax)
11892 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11893 ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax)
11894 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11895 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax)
11896 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
11897 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax)
11898 ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax)
11899 ; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax)
11900 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax)
11901 ; AVX2-FAST-NEXT: addq $1544, %rsp # imm = 0x608
11902 ; AVX2-FAST-NEXT: vzeroupper
11903 ; AVX2-FAST-NEXT: retq
11905 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf64:
11906 ; AVX2-FAST-PERLANE: # %bb.0:
11907 ; AVX2-FAST-PERLANE-NEXT: subq $1448, %rsp # imm = 0x5A8
11908 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm13
11909 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm15
11910 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm11
11911 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm14
11912 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm9
11913 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm12
11914 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3
11915 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11916 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4
11917 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11918 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1
11919 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
11920 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0
11921 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11922 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
11923 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
11924 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
11925 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
11926 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
11927 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3
11928 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
11929 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
11930 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4
11931 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
11932 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
11933 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11934 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7]
11935 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11936 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
11937 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
11938 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
11939 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
11940 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11941 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11942 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
11943 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
11944 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
11945 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11946 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7]
11947 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11948 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11949 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
11950 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
11951 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
11952 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8
11953 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7
11954 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
11955 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11956 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11957 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
11958 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
11959 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
11960 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11961 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm0
11962 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11963 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm10
11964 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7]
11965 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
11966 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
11967 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
11968 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm6
11969 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm1
11970 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
11971 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11972 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
11973 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
11974 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
11975 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
11976 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11977 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7]
11978 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
11979 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
11980 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7]
11981 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
11982 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
11983 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
11984 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
11985 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
11986 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
11987 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
11988 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9
11989 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11990 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
11991 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
11992 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
11993 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7]
11994 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
11995 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
11996 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
11997 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
11998 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
11999 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12000 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
12001 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8
12002 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
12003 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
12004 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12005 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7]
12006 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14
12007 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12008 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
12009 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
12010 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12011 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4
12012 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6
12013 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
12014 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12015 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12016 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
12017 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
12018 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
12019 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
12020 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12021 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12022 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12023 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
12024 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
12025 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
12026 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2
12027 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
12028 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12029 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11
12030 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
12031 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
12032 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
12033 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
12034 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10
12035 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
12036 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
12037 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
12038 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12039 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
12040 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4
12041 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
12042 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12043 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12044 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12045 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7]
12046 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
12047 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
12048 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12049 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12050 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7]
12051 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
12052 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
12053 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12054 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12055 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0
12056 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12057 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7
12058 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
12059 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
12060 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
12061 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12062 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12063 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
12064 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,3,0,1]
12065 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
12066 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12067 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12068 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8
12069 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0
12070 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12071 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12072 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12073 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7]
12074 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
12075 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
12076 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12077 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12078 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12079 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7]
12080 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1]
12081 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15]
12082 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12083 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
12084 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12085 ; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
12086 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7]
12087 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
12088 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
12089 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12090 ; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
12091 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7]
12092 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3]
12093 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
12094 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm4
12095 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
12096 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12097 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
12098 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12099 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7]
12100 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
12101 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
12102 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12103 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
12104 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
12105 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12106 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
12107 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12108 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7]
12109 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
12110 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
12111 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12112 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7]
12113 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
12114 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12115 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
12116 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12117 ; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
12118 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
12119 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4
12120 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
12121 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12122 ; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
12123 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7]
12124 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
12125 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12126 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
12127 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12128 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm9
12129 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0
12130 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12131 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
12132 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12133 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7]
12134 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
12135 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12136 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
12137 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12138 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1
12139 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12140 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2]
12141 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
12142 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm1
12143 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12144 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12145 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
12146 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12147 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8
12148 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm7
12149 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
12150 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12151 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
12152 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12153 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
12154 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12155 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1
12156 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12157 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2]
12158 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm1
12159 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11
12160 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12161 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12162 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
12163 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12164 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3
12165 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6
12166 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
12167 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12168 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12169 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12170 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1
12171 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12172 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
12173 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
12174 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0
12175 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12176 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
12177 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm13
12178 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
12179 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
12180 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7]
12181 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12182 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4
12183 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm10
12184 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
12185 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm12
12186 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13
12187 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10
12188 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2
12189 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12190 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
12191 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12192 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm14
12193 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,1,0,2]
12194 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12195 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm11
12196 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
12197 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12198 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
12199 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12200 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
12201 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11
12202 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7]
12203 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
12204 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12205 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12206 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
12207 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5
12208 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
12209 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12210 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
12211 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
12212 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12213 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7]
12214 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5
12215 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
12216 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12217 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12218 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1
12219 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12220 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12221 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12222 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12223 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12224 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7]
12225 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12226 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
12227 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12228 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12229 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1
12230 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12231 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12232 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12233 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12234 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12235 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12236 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
12237 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12238 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
12239 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0
12240 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm1
12241 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12242 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12243 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12244 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12245 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12246 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12247 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
12248 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6
12249 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12250 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
12251 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
12252 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12253 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12254 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12255 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,2]
12256 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm5 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
12257 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
12258 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12259 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12260 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12261 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12262 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12263 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
12264 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
12265 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
12266 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12267 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12268 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12269 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2]
12270 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
12271 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12272 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12273 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12274 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12275 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12276 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12277 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12278 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
12279 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
12280 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
12281 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12282 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12283 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2]
12284 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2
12285 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12286 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12287 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12288 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12289 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12290 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12291 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12292 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7]
12293 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
12294 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
12295 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12296 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12297 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2]
12298 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1
12299 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12300 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12301 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12302 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
12303 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12304 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12305 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7]
12306 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14
12307 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12308 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12309 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
12310 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
12311 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
12312 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12313 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,3]
12314 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12315 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
12316 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5
12317 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
12318 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12319 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
12320 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
12321 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12322 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7]
12323 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12324 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11
12325 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12326 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5
12327 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
12328 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
12329 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12330 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3]
12331 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm6
12332 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
12333 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12334 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
12335 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
12336 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12337 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7]
12338 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6
12339 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7]
12340 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
12341 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
12342 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,3]
12343 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm7
12344 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
12345 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12346 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
12347 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
12348 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12349 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
12350 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
12351 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
12352 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0
12353 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
12354 ; AVX2-FAST-PERLANE-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
12355 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,1,3]
12356 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2
12357 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
12358 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12359 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15]
12360 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
12361 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12362 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12363 ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12364 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
12365 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
12366 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4
12367 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12
12368 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2
12369 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
12370 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
12371 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12372 ; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
12373 ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7]
12374 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27>
12375 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7
12376 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
12377 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
12378 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
12379 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7]
12380 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8
12381 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
12382 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
12383 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8
12384 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12385 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
12386 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm5
12387 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7]
12388 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15]
12389 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
12390 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12391 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12392 ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12393 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
12394 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6
12395 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5
12396 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
12397 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
12398 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
12399 ; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
12400 ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7]
12401 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7
12402 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
12403 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
12404 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
12405 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7]
12406 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8
12407 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
12408 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6
12409 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
12410 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12411 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm3
12412 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm8
12413 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
12414 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15]
12415 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
12416 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12417 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12418 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12419 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
12420 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm5
12421 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm14
12422 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3
12423 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
12424 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
12425 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12426 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12427 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
12428 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm6
12429 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
12430 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
12431 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
12432 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12433 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12434 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
12435 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7
12436 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
12437 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5
12438 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12439 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1
12440 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
12441 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
12442 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
12443 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12444 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12445 ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12446 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
12447 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm3
12448 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
12449 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
12450 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
12451 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12452 ; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
12453 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7]
12454 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12455 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
12456 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
12457 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
12458 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12459 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12460 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
12461 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5
12462 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
12463 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3
12464 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0
12465 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12466 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
12467 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12468 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12469 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12470 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
12471 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
12472 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12473 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12474 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12475 ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12476 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
12477 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
12478 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
12479 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
12480 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
12481 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12482 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm3
12483 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12484 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm2
12485 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12486 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7]
12487 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
12488 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2
12489 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12490 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
12491 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12492 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12493 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
12494 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4
12495 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5]
12496 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8
12497 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
12498 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2
12499 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
12500 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29>
12501 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12502 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
12503 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
12504 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12505 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12506 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
12507 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12508 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12509 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
12510 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8
12511 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7]
12512 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
12513 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12514 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm9
12515 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12516 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm8
12517 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12518 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7]
12519 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8
12520 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12521 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
12522 ; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
12523 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
12524 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
12525 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm9
12526 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11
12527 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8
12528 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
12529 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12530 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
12531 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
12532 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12533 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12534 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12535 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
12536 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12537 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12538 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
12539 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
12540 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
12541 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0
12542 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11
12543 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm1
12544 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm0
12545 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
12546 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15
12547 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
12548 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15]
12549 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12550 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
12551 ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7]
12552 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13
12553 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm13
12554 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15
12555 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
12556 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm8
12557 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15]
12558 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3]
12559 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm11[4,5,6,7]
12560 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12561 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12562 ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
12563 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7]
12564 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
12565 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7]
12566 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8
12567 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm14
12568 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm5
12569 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7]
12570 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3
12571 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12572 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12573 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15]
12574 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
12575 ; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
12576 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
12577 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
12578 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7,8],ymm8[9,10,11,12,13,14],ymm13[15]
12579 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2
12580 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12581 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12582 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
12583 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm13
12584 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm10
12585 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4
12586 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
12587 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
12588 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
12589 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12590 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12591 ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12592 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7]
12593 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
12594 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
12595 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12596 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
12597 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7]
12598 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
12599 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7]
12600 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12601 ; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
12602 ; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
12603 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
12604 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3
12605 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12606 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15]
12607 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7
12608 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
12609 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
12610 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12611 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
12612 ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7]
12613 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9
12614 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
12615 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9
12616 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
12617 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
12618 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31>
12619 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2
12620 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
12621 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
12622 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
12623 ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
12624 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6,7]
12625 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
12626 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15]
12627 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12628 ; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
12629 ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
12630 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
12631 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7]
12632 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
12633 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1
12634 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12635 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0
12636 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12637 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
12638 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12639 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12640 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7]
12641 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9
12642 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9
12643 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
12644 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
12645 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8
12646 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
12647 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
12648 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12649 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12650 ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12651 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7]
12652 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
12653 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15]
12654 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12655 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
12656 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
12657 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
12658 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
12659 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12660 ; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
12661 ; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7]
12662 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8
12663 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12664 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9
12665 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
12666 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
12667 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12668 ; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
12669 ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7]
12670 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
12671 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm12
12672 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7]
12673 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
12674 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1
12675 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm8[1,2,3,4,5,6,7],ymm1[8],ymm8[9,10,11,12,13,14,15]
12676 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
12677 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
12678 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12679 ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
12680 ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
12681 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
12682 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
12683 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4
12684 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7]
12685 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5
12686 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
12687 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12688 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
12689 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
12690 ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12691 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7]
12692 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
12693 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15]
12694 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm5
12695 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
12696 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
12697 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm3
12698 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7]
12699 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
12700 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
12701 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
12702 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
12703 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12704 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi)
12705 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12706 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi)
12707 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12708 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi)
12709 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12710 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi)
12711 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12712 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx)
12713 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12714 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx)
12715 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12716 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx)
12717 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12718 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx)
12719 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12720 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx)
12721 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12722 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx)
12723 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12724 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx)
12725 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12726 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx)
12727 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12728 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8)
12729 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12730 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8)
12731 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12732 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8)
12733 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12734 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8)
12735 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12736 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9)
12737 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12738 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9)
12739 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12740 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9)
12741 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12742 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9)
12743 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
12744 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12745 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax)
12746 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12747 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax)
12748 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12749 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax)
12750 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax)
12751 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
12752 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rax)
12753 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax)
12754 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax)
12755 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax)
12756 ; AVX2-FAST-PERLANE-NEXT: addq $1448, %rsp # imm = 0x5A8
12757 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
12758 ; AVX2-FAST-PERLANE-NEXT: retq
12760 ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf64:
12761 ; AVX512F-ONLY-SLOW: # %bb.0:
12762 ; AVX512F-ONLY-SLOW-NEXT: subq $1864, %rsp # imm = 0x748
12763 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1
12764 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2
12765 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
12766 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16
12767 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18
12768 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
12769 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
12770 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u>
12771 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
12772 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3
12773 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4
12774 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
12775 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm13
12776 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10
12777 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
12778 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
12779 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3
12780 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2
12781 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12782 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2
12783 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4
12784 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
12785 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21
12786 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
12787 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
12788 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12789 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4
12790 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
12791 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6
12792 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7
12793 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
12794 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
12795 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1
12796 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7]
12797 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm8
12798 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
12799 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
12800 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0
12801 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
12802 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12803 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9
12804 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11
12805 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19
12806 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm19[0,1,0,2]
12807 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0
12808 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15
12809 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3]
12810 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
12811 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
12812 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12813 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0
12814 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
12815 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm14
12816 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm4
12817 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
12818 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
12819 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12820 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7]
12821 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17
12822 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12823 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
12824 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
12825 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5
12826 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0
12827 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12828 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7]
12829 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20
12830 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
12831 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
12832 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
12833 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
12834 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12835 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[0,1,1,3,4,5,5,7]
12836 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
12837 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
12838 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0
12839 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm15[1],xmm0[2,3,4,5,6,7]
12840 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23
12841 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
12842 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
12843 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0
12844 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12845 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7
12846 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm12
12847 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
12848 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
12849 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3
12850 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10
12851 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm11
12852 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
12853 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
12854 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
12855 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6
12856 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3
12857 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12858 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6
12859 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm7
12860 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
12861 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22
12862 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm30
12863 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
12864 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
12865 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
12866 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16
12867 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
12868 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12869 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
12870 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
12871 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
12872 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm3
12873 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1
12874 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7]
12875 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
12876 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
12877 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
12878 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12879 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7]
12880 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26
12881 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12882 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4
12883 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12884 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12885 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
12886 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14
12887 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12888 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5,6,7]
12889 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12890 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
12891 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
12892 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
12893 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
12894 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
12895 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6
12896 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6
12897 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12898 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0
12899 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
12900 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm17
12901 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8
12902 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
12903 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
12904 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
12905 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12906 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,1,2]
12907 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
12908 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
12909 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
12910 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0
12911 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
12912 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28
12913 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12914 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3]
12915 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
12916 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
12917 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12918 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
12919 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12920 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12921 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
12922 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
12923 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7
12924 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
12925 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12926 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12927 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
12928 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
12929 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5
12930 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5
12931 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12932 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm9
12933 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0
12934 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
12935 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
12936 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
12937 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5
12938 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12939 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,1,2]
12940 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
12941 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
12942 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
12943 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12944 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12945 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
12946 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
12947 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
12948 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12949 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
12950 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12951 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
12952 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0
12953 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
12954 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
12955 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
12956 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8
12957 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
12958 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7
12959 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm1
12960 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12961 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
12962 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
12963 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
12964 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6
12965 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
12966 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
12967 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5
12968 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm1
12969 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12970 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15
12971 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10
12972 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
12973 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12974 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
12975 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,1,3]
12976 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
12977 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5
12978 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12979 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5]
12980 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12981 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
12982 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6
12983 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm23, %xmm7
12984 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
12985 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
12986 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12987 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2
12988 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12
12989 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7]
12990 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
12991 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
12992 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1
12993 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
12994 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12995 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5]
12996 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12997 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
12998 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm5
12999 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm6
13000 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
13001 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0
13002 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13003 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7]
13004 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
13005 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7]
13006 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
13007 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5
13008 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
13009 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6
13010 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7
13011 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7]
13012 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
13013 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
13014 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
13015 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
13016 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13
13017 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm28, %xmm8
13018 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4
13019 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
13020 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0
13021 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13022 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7]
13023 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8
13024 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7]
13025 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1
13026 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
13027 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1
13028 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm5
13029 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7]
13030 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
13031 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6]
13032 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
13033 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
13034 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11
13035 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm21, %xmm9
13036 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
13037 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
13038 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13039 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
13040 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14
13041 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
13042 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
13043 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
13044 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
13045 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
13046 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
13047 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
13048 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
13049 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
13050 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
13051 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
13052 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
13053 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
13054 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0
13055 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13056 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7]
13057 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4
13058 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
13059 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
13060 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
13061 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
13062 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13063 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5]
13064 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
13065 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
13066 ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
13067 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
13068 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
13069 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 {%k1} # 16-byte Folded Reload
13070 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13071 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7]
13072 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm18
13073 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm20
13074 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
13075 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
13076 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
13077 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7]
13078 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13079 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
13080 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
13081 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13082 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
13083 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
13084 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13085 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
13086 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
13087 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13088 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1
13089 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2
13090 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
13091 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm7
13092 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8
13093 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
13094 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
13095 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3
13096 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2
13097 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
13098 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17
13099 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5
13100 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
13101 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
13102 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
13103 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
13104 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
13105 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2
13106 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
13107 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13108 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1
13109 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm3
13110 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
13111 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm9
13112 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23
13113 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
13114 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
13115 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
13116 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
13117 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
13118 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13119 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
13120 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13121 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
13122 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21
13123 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22
13124 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
13125 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
13126 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
13127 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
13128 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
13129 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13130 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,1,2,1,4,5,6,5]
13131 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
13132 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
13133 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13134 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload
13135 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13136 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4
13137 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10
13138 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7]
13139 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
13140 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
13141 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14
13142 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15
13143 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7]
13144 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
13145 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15]
13146 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0
13147 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
13148 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
13149 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7]
13150 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7]
13151 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0
13152 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2
13153 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
13154 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25
13155 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12
13156 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
13157 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
13158 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
13159 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
13160 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
13161 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm12[6,7]
13162 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13163 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
13164 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12
13165 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6],xmm12[7]
13166 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7]
13167 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
13168 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
13169 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
13170 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
13171 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
13172 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12
13173 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3,4,5,6],xmm12[7]
13174 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
13175 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3
13176 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
13177 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13
13178 ; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm25
13179 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
13180 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
13181 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
13182 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
13183 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
13184 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
13185 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm11[6,7]
13186 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13187 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
13188 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11
13189 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6],xmm11[7]
13190 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
13191 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1
13192 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
13193 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
13194 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
13195 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0
13196 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3,4,5,6],xmm0[7]
13197 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
13198 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2
13199 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
13200 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11
13201 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
13202 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
13203 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
13204 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
13205 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
13206 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
13207 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
13208 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13209 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
13210 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23
13211 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23>
13212 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11
13213 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13214 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
13215 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7]
13216 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
13217 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
13218 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u>
13219 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11
13220 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
13221 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm11
13222 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7]
13223 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17
13224 ; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm25
13225 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
13226 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1]
13227 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
13228 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
13229 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13230 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
13231 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13232 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm15
13233 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7]
13234 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6
13235 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13236 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6,7,8],ymm0[9],ymm6[10,11,12,13,14,15]
13237 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
13238 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm3
13239 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11
13240 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7]
13241 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6
13242 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm6
13243 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7]
13244 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
13245 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
13246 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
13247 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7]
13248 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
13249 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13250 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
13251 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13252 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0
13253 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1
13254 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
13255 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
13256 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7]
13257 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
13258 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
13259 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13260 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,0,4,5,6,4]
13261 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
13262 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
13263 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
13264 ; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7]
13265 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
13266 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm30
13267 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm1
13268 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm7
13269 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7]
13270 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
13271 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7]
13272 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4
13273 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13
13274 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4,5],ymm4[6],ymm13[7]
13275 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11
13276 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7]
13277 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
13278 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10
13279 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7]
13280 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0]
13281 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
13282 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11
13283 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0
13284 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7]
13285 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
13286 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
13287 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4,5,6,7],ymm11[8,9,10],ymm6[11,12,13,14,15]
13288 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
13289 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13290 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0
13291 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6
13292 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7]
13293 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
13294 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7]
13295 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
13296 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
13297 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13298 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,0,4,5,6,4]
13299 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
13300 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
13301 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
13302 ; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7]
13303 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
13304 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm25
13305 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
13306 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23
13307 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24
13308 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
13309 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7]
13310 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7]
13311 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm22
13312 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21
13313 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,2,0]
13314 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6
13315 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7]
13316 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3
13317 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
13318 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
13319 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
13320 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm0, %ymm0
13321 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
13322 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
13323 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
13324 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4,5,6,7],ymm6[8,9,10],ymm11[11,12,13,14,15]
13325 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
13326 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13327 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13328 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
13329 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0
13330 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7]
13331 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29
13332 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27>
13333 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11
13334 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13335 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15]
13336 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13337 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
13338 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5
13339 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7]
13340 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm26
13341 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12
13342 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
13343 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
13344 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
13345 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
13346 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7]
13347 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13348 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7]
13349 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16
13350 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm12
13351 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
13352 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
13353 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7]
13354 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1]
13355 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15]
13356 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7]
13357 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm9
13358 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8
13359 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15
13360 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
13361 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
13362 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
13363 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13364 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
13365 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11
13366 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
13367 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
13368 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm11, %ymm13, %ymm11
13369 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
13370 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13371 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13372 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
13373 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0
13374 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4,5],ymm0[6],ymm4[7]
13375 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27
13376 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6
13377 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
13378 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7,8,9,10,11,12,13],ymm0[14],ymm6[15]
13379 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13380 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13381 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7]
13382 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28
13383 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11
13384 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
13385 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
13386 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
13387 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
13388 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7]
13389 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
13390 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
13391 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm4
13392 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13393 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5
13394 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
13395 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm15
13396 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
13397 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7]
13398 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
13399 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7
13400 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2
13401 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13402 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
13403 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm22[0,1,0,1]
13404 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13405 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5,6,7,8,9,10],ymm11[11],ymm6[12,13,14,15]
13406 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1
13407 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4
13408 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3
13409 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
13410 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11
13411 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1],xmm6[2,3,4,5],xmm11[6],xmm6[7]
13412 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13413 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
13414 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
13415 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm6, %ymm1
13416 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13417 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13418 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
13419 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31
13420 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm24
13421 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13422 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
13423 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1
13424 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6,7]
13425 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21
13426 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm22
13427 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6
13428 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7]
13429 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
13430 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16
13431 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm23
13432 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11
13433 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2],xmm6[3],xmm11[4,5,6,7]
13434 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
13435 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
13436 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
13437 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
13438 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13439 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
13440 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
13441 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0
13442 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13443 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13444 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10
13445 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14
13446 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
13447 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13448 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
13449 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9
13450 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8
13451 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
13452 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6
13453 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
13454 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
13455 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13456 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
13457 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
13458 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
13459 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0
13460 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13461 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13462 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7]
13463 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13464 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
13465 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
13466 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6,7]
13467 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm19
13468 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17
13469 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11
13470 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7]
13471 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
13472 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
13473 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26
13474 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29
13475 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm15
13476 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1],xmm15[2],xmm11[3],xmm15[4,5,6,7]
13477 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13478 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
13479 ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
13480 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0
13481 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13482 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
13483 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm15
13484 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5
13485 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7]
13486 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13487 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
13488 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0
13489 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12
13490 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7]
13491 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4
13492 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6
13493 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
13494 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
13495 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13496 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
13497 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
13498 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13499 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
13500 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm18
13501 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7]
13502 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
13503 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
13504 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
13505 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6
13506 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
13507 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
13508 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
13509 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
13510 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
13511 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0
13512 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
13513 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0
13514 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1
13515 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
13516 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
13517 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
13518 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1
13519 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2
13520 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
13521 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
13522 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
13523 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
13524 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
13525 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13526 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2
13527 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3
13528 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7]
13529 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,3,1]
13530 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
13531 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2
13532 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
13533 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
13534 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
13535 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13536 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
13537 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13538 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16
13539 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
13540 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
13541 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
13542 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
13543 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7]
13544 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1]
13545 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5,6,7,8],ymm15[9],ymm8[10,11,12,13,14,15]
13546 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6
13547 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7]
13548 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm15
13549 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
13550 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1]
13551 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7]
13552 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
13553 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7]
13554 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
13555 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm20, %zmm24
13556 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4
13557 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5
13558 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
13559 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8
13560 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
13561 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6
13562 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4
13563 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5
13564 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
13565 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
13566 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
13567 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
13568 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7]
13569 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
13570 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13571 ; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
13572 ; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm4[0,1],mem[2],ymm4[3,4,5],mem[6],ymm4[7]
13573 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
13574 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
13575 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload
13576 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
13577 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
13578 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
13579 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload
13580 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload
13581 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm1
13582 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm3
13583 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1]
13584 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm5
13585 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7]
13586 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
13587 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
13588 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
13589 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
13590 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload
13591 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
13592 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
13593 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
13594 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload
13595 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
13596 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm23 # 32-byte Folded Reload
13597 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm4
13598 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
13599 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
13600 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm12 # 64-byte Folded Reload
13601 ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
13602 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1
13603 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1}
13604 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
13605 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload
13606 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1}
13607 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7
13608 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm30 # 64-byte Folded Reload
13609 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm30 {%k1}
13610 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm25 # 64-byte Folded Reload
13611 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1}
13612 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
13613 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload
13614 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1}
13615 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi)
13616 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rsi)
13617 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
13618 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
13619 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx)
13620 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx)
13621 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r8)
13622 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%r8)
13623 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9)
13624 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13625 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1}
13626 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13627 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload
13628 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1}
13629 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9)
13630 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
13631 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax)
13632 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm11 # 64-byte Folded Reload
13633 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0
13634 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm23, %zmm0 {%k1}
13635 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm11 {%k1}
13636 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax)
13637 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
13638 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
13639 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0
13640 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1}
13641 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax)
13642 ; AVX512F-ONLY-SLOW-NEXT: addq $1864, %rsp # imm = 0x748
13643 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper
13644 ; AVX512F-ONLY-SLOW-NEXT:    retq
13645 ;
13646 ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf64:
13647 ; AVX512F-ONLY-FAST: # %bb.0:
13648 ; AVX512F-ONLY-FAST-NEXT: subq $1768, %rsp # imm = 0x6E8
13649 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5
13650 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
13651 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,9,u,12,u,u,u>
13652 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm0
13653 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
13654 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm2
13655 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16
13656 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm3, %zmm4
13657 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19
13658 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm5
13659 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm6
13660 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
13661 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21
13662 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm12
13663 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5
13664 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
13665 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u>
13666 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6
13667 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
13668 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
13669 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm22
13670 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm7
13671 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
13672 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6
13673 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm20
13674 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7
13675 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
13676 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13677 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24
13678 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2]
13679 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13680 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
13681 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6
13682 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
13683 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
13684 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8
13685 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
13686 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
13687 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3
13688 ; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm29
13689 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm13
13690 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2
13691 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3
13692 ; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
13693 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13694 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15
13695 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
13696 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
13697 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
13698 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17
13699 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18
13700 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
13701 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
13702 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
13703 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3
13704 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0
13705 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13706 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11
13707 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3
13708 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7]
13709 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
13710 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
13711 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
13712 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5
13713 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7
13714 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
13715 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9
13716 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
13717 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7]
13718 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm6
13719 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
13720 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10
13721 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
13722 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13723 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm14
13724 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm15
13725 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
13726 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
13727 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
13728 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8
13729 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm1
13730 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
13731 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1
13732 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13733 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0
13734 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1
13735 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
13736 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21
13737 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23
13738 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
13739 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7]
13740 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm30
13741 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8
13742 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm30[0,1,0,2]
13743 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
13744 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5
13745 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28
13746 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7]
13747 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm4
13748 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm2
13749 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1],xmm4[2,3,4,5,6,7]
13750 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8
13751 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,6,9,u,13,u,u,u>
13752 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0
13753 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13754 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
13755 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13756 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0
13757 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13758 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
13759 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8
13760 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7]
13761 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27
13762 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13763 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm9
13764 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
13765 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9
13766 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
13767 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8
13768 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8
13769 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13770 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7]
13771 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm16
13772 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25
13773 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
13774 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
13775 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
13776 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8
13777 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
13778 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5]
13779 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm11, %ymm12
13780 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13781 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7]
13782 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
13783 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm20
13784 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm17
13785 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
13786 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13
13787 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm3
13788 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13789 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7]
13790 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3
13791 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13792 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13793 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
13794 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
13795 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10
13796 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm7, %zmm7
13797 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13798 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5
13799 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm5
13800 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13801 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5
13802 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7
13803 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7]
13804 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7
13805 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
13806 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5
13807 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
13808 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm11, %ymm7
13809 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13810 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
13811 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
13812 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18
13813 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm31
13814 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm9
13815 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm2
13816 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13817 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
13818 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9
13819 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7]
13820 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
13821 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm10
13822 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
13823 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10
13824 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
13825 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9
13826 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm9, %ymm0
13827 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13828 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm4
13829 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6
13830 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
13831 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
13832 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7]
13833 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
13834 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9
13835 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13
13836 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
13837 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,1,3]
13838 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm15
13839 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7]
13840 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
13841 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8
13842 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm0
13843 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13844 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7]
13845 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15
13846 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2],xmm8[3],xmm15[4,5,6,7]
13847 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8
13848 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm12
13849 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5
13850 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm8, %ymm0
13851 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13852 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1
13853 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2
13854 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7]
13855 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8
13856 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7]
13857 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5
13858 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
13859 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm30[0,1,1,3]
13860 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm10
13861 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7]
13862 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7
13863 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0
13864 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13865 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
13866 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
13867 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7]
13868 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
13869 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3
13870 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13871 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
13872 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm5
13873 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7]
13874 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm5
13875 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm10
13876 ; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm20, %xmm9
13877 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
13878 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm25
13879 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3
13880 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3
13881 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm20
13882 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [3,6,10,13,3,6,10,13]
13883 ; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3]
13884 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm23, %zmm4
13885 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u>
13886 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4
13887 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
13888 ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0
13889 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
13890 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 {%k1} # 16-byte Folded Reload
13891 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13892 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
13893 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
13894 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
13895 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0
13896 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13897 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1
13898 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
13899 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1
13900 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm6
13901 ; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm18, %xmm2
13902 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13903 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22
13904 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7]
13905 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm2, %ymm0
13906 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
13907 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1
13908 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4
13909 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12]
13910 ; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1]
13911 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm3
13912 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
13913 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3
13914 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15]
13915 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm13
13916 ; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm3
13917 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
13918 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1
13919 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13920 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3
13921 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm8
13922 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7]
13923 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17
13924 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm18
13925 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
13926 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7]
13927 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
13928 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
13929 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm19
13930 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28
13931 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14>
13932 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm3
13933 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
13934 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3
13935 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
13936 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7]
13937 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13938 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1
13939 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm7
13940 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7
13941 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
13942 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 {%k1} # 16-byte Folded Reload
13943 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13944 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm2, %ymm1
13945 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
13946 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm2
13947 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
13948 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
13949 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm8
13950 ; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm31, %xmm1
13951 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
13952 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm21
13953 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
13954 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13955 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0]
13956 ; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1]
13957 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm9, %ymm0
13958 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
13959 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13]
13960 ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1]
13961 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm1
13962 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
13963 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1
13964 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
13965 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
13966 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
13967 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2
13968 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
13969 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
13970 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13971 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4
13972 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm6
13973 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
13974 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
13975 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm6[6,7]
13976 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
13977 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0
13978 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm1
13979 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
13980 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm2
13981 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1
13982 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12
13983 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3],xmm12[4],xmm3[5],xmm12[6,7]
13984 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0
13985 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3
13986 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29
13987 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12
13988 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm11
13989 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4,5,6],xmm11[7]
13990 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
13991 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm11
13992 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm5, %zmm5
13993 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5
13994 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7]
13995 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13996 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
13997 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13
13998 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm10
13999 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5
14000 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6],xmm5[7]
14001 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
14002 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3
14003 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,u,u,u,4,8,11,15>
14004 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12
14005 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
14006 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm12
14007 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3,4,5,6],xmm12[7]
14008 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
14009 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm15, %zmm12
14010 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
14011 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
14012 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7]
14013 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14014 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm6
14015 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0
14016 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
14017 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12
14018 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm3[4],xmm12[5],xmm3[6],xmm12[7]
14019 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3
14020 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm5
14021 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2
14022 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7]
14023 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14024 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm3
14025 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
14026 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14027 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14028 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7]
14029 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm17
14030 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
14031 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
14032 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u>
14033 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
14034 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,u,u,5,8,12,15>
14035 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm5
14036 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
14037 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5
14038 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1
14039 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm23, %zmm5
14040 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5
14041 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
14042 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14043 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7]
14044 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10
14045 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5
14046 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7]
14047 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
14048 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm2
14049 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2
14050 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1
14051 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm0
14052 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0
14053 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14054 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14055 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm9, %ymm0
14056 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
14057 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm15, %zmm1
14058 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1
14059 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
14060 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1
14061 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
14062 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2
14063 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
14064 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
14065 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14066 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm1
14067 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm14
14068 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7]
14069 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm8
14070 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
14071 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
14072 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4
14073 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm11
14074 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
14075 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
14076 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
14077 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
14078 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
14079 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14080 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,11,2,11,12,5,8,9]
14081 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm5, %zmm9
14082 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
14083 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9
14084 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
14085 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
14086 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1
14087 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14088 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14089 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 864(%rdi), %ymm1
14090 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm13
14091 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7]
14092 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19
14093 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
14094 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
14095 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
14096 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm1
14097 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1
14098 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm7
14099 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
14100 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
14101 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7]
14102 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14103 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
14104 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
14105 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
14106 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14107 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14108 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
14109 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14110 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7]
14111 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
14112 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2
14113 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20
14114 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1
14115 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
14116 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14117 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,3,7,10,14,u,u,u>
14118 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
14119 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm5
14120 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
14121 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5
14122 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm6
14123 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5,6,7]
14124 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
14125 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25
14126 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14127 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7]
14128 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm21
14129 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5
14130 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7]
14131 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7]
14132 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm23
14133 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11
14134 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5],xmm11[6],xmm5[7]
14135 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
14136 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1
14137 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14138 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,u,u,u,6,9,13,u>
14139 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm18, %zmm0
14140 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
14141 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
14142 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
14143 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
14144 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0
14145 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14146 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14147 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
14148 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
14149 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
14150 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8
14151 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1
14152 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
14153 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
14154 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
14155 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
14156 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1
14157 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1
14158 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
14159 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm22
14160 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14161 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm6
14162 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7]
14163 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm19
14164 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
14165 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
14166 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0
14167 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm18, %zmm1
14168 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
14169 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
14170 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13
14171 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18
14172 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
14173 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7]
14174 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14175 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
14176 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
14177 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
14178 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14179 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31
14180 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7]
14181 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm24
14182 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25
14183 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
14184 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1
14185 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm27
14186 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
14187 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0
14188 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14189 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u>
14190 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm2
14191 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22
14192 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29>
14193 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2
14194 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
14195 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14196 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5
14197 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7]
14198 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12
14199 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7]
14200 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8
14201 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2
14202 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7]
14203 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15
14204 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7]
14205 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
14206 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0
14207 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14208 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,u,6,10,13,u>
14209 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm1
14210 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
14211 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1
14212 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
14213 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
14214 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1
14215 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14216 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30
14217 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
14218 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm21
14219 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm23
14220 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
14221 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3
14222 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
14223 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0
14224 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14225 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm17 # 32-byte Folded Reload
14226 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1
14227 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1
14228 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
14229 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9
14230 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6,7]
14231 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19
14232 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11
14233 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
14234 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload
14235 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0
14236 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm4
14237 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3
14238 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10
14239 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
14240 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm18
14241 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11
14242 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7]
14243 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14244 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
14245 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
14246 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3
14247 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
14248 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27
14249 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0
14250 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3
14251 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
14252 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
14253 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
14254 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3
14255 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
14256 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
14257 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,8,11,15,u,u,u>
14258 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm11
14259 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
14260 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11
14261 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7]
14262 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
14263 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
14264 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
14265 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7]
14266 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
14267 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8
14268 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
14269 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7]
14270 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
14271 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3]
14272 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm13
14273 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
14274 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13
14275 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3,4,5,6,7]
14276 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
14277 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6
14278 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
14279 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15]
14280 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
14281 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5
14282 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8
14283 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7]
14284 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15
14285 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4
14286 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
14287 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
14288 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
14289 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm3, %zmm3
14290 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3
14291 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5,6,7]
14292 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm5
14293 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7]
14294 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
14295 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7]
14296 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8
14297 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5
14298 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7]
14299 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11
14300 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9
14301 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7]
14302 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
14303 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload
14304 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10
14305 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7
14306 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
14307 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
14308 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload
14309 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
14310 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
14311 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
14312 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm17 # 64-byte Folded Reload
14313 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 # 64-byte Folded Reload
14314 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm4
14315 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm11
14316 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1,2],ymm7[3,4,5,6,7]
14317 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14318 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
14319 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
14320 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
14321 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm5 # 64-byte Folded Reload
14322 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
14323 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
14324 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
14325 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
14326 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload
14327 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm15
14328 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm19
14329 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm12
14330 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6
14331 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
14332 ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
14333 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1
14334 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1}
14335 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
14336 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload
14337 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1}
14338 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
14339 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm9 # 64-byte Folded Reload
14340 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1}
14341 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
14342 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload
14343 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm10 {%k1}
14344 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14345 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1}
14346 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
14347 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1}
14348 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload
14349 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1}
14350 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi)
14351 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi)
14352 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rdx)
14353 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx)
14354 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rcx)
14355 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rcx)
14356 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r8)
14357 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8)
14358 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9)
14359 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9)
14360 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
14361 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax)
14362 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14363 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload
14364 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1}
14365 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax)
14366 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload
14367 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1}
14368 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
14369 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax)
14370 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload
14371 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1}
14372 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax)
14373 ; AVX512F-ONLY-FAST-NEXT: addq $1768, %rsp # imm = 0x6E8
14374 ; AVX512F-ONLY-FAST-NEXT: vzeroupper
14375 ; AVX512F-ONLY-FAST-NEXT: retq
14377 ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf64:
14378 ; AVX512DQ-SLOW: # %bb.0:
14379 ; AVX512DQ-SLOW-NEXT: subq $1560, %rsp # imm = 0x618
14380 ; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1
14381 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2
14382 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
14383 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20
14384 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23
14385 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
14386 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
14387 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u>
14388 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
14389 ; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14
14390 ; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12
14391 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
14392 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
14393 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
14394 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3
14395 ; AVX512DQ-SLOW-NEXT: vporq %ymm3, %ymm2, %ymm27
14396 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2
14397 ; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4
14398 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
14399 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22
14400 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
14401 ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
14402 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14403 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3
14404 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4
14405 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5
14406 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8
14407 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7]
14408 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
14409 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1
14410 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
14411 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6
14412 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm4
14413 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
14414 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
14415 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0
14416 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0
14417 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14418 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13
14419 ; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11
14420 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18
14421 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,1,0,2]
14422 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0
14423 ; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1
14424 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3]
14425 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
14426 ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14427 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14428 ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0
14429 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7]
14430 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm9
14431 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7,8,9,10],ymm0[11],ymm2[12,13,14,15]
14432 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14433 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
14434 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
14435 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17
14436 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16
14437 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5
14438 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7]
14439 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
14440 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3
14441 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0
14442 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14443 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7]
14444 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19
14445 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
14446 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
14447 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
14448 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
14449 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14450 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[0,1,1,3,4,5,5,7]
14451 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
14452 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5,6],ymm6[7]
14453 ; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15
14454 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm1[1],xmm15[2,3,4,5,6,7]
14455 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm11
14456 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
14457 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,3,3,4,5,6,7]
14458 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0
14459 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14460 ; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm6
14461 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7]
14462 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24
14463 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21
14464 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7,8,9,10],ymm6[11],ymm7[12,13,14,15]
14465 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2
14466 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4
14467 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0
14468 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7]
14469 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
14470 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7]
14471 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5
14472 ; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2
14473 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14474 ; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12
14475 ; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm14
14476 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7]
14477 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5
14478 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
14479 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm3
14480 ; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20
14481 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm20[0,1,0,2]
14482 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14483 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,1,1,3,4,5,5,7]
14484 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
14485 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
14486 ; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0
14487 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2
14488 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
14489 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25
14490 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
14491 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
14492 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
14493 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14494 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
14495 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30
14496 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14497 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3
14498 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14499 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14500 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
14501 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10
14502 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14503 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1
14504 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14505 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6,7]
14506 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
14507 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
14508 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14509 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
14510 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
14511 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6
14512 ; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm0
14513 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14514 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0
14515 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7]
14516 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16
14517 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8
14518 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
14519 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
14520 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
14521 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14522 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2]
14523 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
14524 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
14525 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
14526 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
14527 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17
14528 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22
14529 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14530 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3]
14531 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
14532 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
14533 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14534 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm13
14535 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14536 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15
14537 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14538 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
14539 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
14540 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
14541 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7
14542 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm11
14543 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14544 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7]
14545 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14546 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
14547 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
14548 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5
14549 ; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm0
14550 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14551 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7]
14552 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24
14553 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm9
14554 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
14555 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
14556 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5
14557 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14558 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2]
14559 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
14560 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
14561 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
14562 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14
14563 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
14564 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23
14565 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14566 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
14567 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
14568 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
14569 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14570 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7]
14571 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14572 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14573 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0
14574 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
14575 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
14576 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14577 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8
14578 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
14579 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7
14580 ; AVX512DQ-SLOW-NEXT: vporq %ymm8, %ymm7, %ymm30
14581 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7]
14582 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
14583 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
14584 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6
14585 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
14586 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
14587 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5
14588 ; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0
14589 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14590 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12
14591 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4
14592 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7]
14593 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14594 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14595 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm18[0,1,1,3]
14596 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
14597 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5
14598 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14599 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5]
14600 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14601 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
14602 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6
14603 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8
14604 ; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm7
14605 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
14606 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
14607 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14608 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11
14609 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7]
14610 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
14611 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14612 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1
14613 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,1,1,3]
14614 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14615 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,1,4,5,6,5]
14616 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14617 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
14618 ; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm2
14619 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3
14620 ; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm5
14621 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
14622 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm0
14623 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14624 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7]
14625 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13
14626 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
14627 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
14628 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
14629 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
14630 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14631 ; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5
14632 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6
14633 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7]
14634 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
14635 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
14636 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
14637 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
14638 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0
14639 ; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm22, %xmm7
14640 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
14641 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14
14642 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2
14643 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14644 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm10
14645 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7]
14646 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
14647 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3],xmm2[4],xmm7[5,6,7]
14648 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
14649 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
14650 ; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1
14651 ; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm2
14652 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
14653 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
14654 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
14655 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14656 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
14657 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9
14658 ; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm23, %xmm8
14659 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
14660 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4
14661 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14662 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
14663 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
14664 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7]
14665 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7]
14666 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
14667 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7
14668 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
14669 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
14670 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
14671 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
14672 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7],ymm7[8,9,10,11,12],ymm6[13,14,15]
14673 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
14674 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
14675 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
14676 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
14677 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14678 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
14679 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
14680 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
14681 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
14682 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
14683 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
14684 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14685 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,1,4,5,6,5]
14686 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14687 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
14688 ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0
14689 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
14690 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k1} # 16-byte Folded Reload
14691 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14692 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
14693 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21
14694 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm18
14695 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6
14696 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
14697 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
14698 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
14699 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14700 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
14701 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
14702 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14703 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
14704 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
14705 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
14706 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
14707 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
14708 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14709 ; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1
14710 ; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2
14711 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
14712 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6
14713 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7
14714 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
14715 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
14716 ; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3
14717 ; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2
14718 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
14719 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16
14720 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5
14721 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
14722 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
14723 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
14724 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
14725 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
14726 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
14727 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
14728 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14729 ; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3
14730 ; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm8
14731 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
14732 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10
14733 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
14734 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
14735 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
14736 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
14737 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
14738 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14739 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
14740 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14741 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
14742 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm17
14743 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22
14744 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
14745 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
14746 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
14747 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
14748 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
14749 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14750 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,1,4,5,6,5]
14751 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14752 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
14753 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14754 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
14755 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14756 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2
14757 ; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4
14758 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
14759 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14
14760 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
14761 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
14762 ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12
14763 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0
14764 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7]
14765 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1]
14766 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6],ymm9[7,8,9,10,11,12,13],ymm13[14],ymm9[15]
14767 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1
14768 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
14769 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
14770 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7]
14771 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14772 ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1
14773 ; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2
14774 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
14775 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15
14776 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7]
14777 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
14778 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
14779 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
14780 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
14781 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm13[6,7]
14782 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14783 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7]
14784 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23
14785 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13
14786 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6],xmm13[7]
14787 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7]
14788 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm14
14789 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
14790 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7,8],ymm13[9,10,11,12,13,14],ymm15[15]
14791 ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
14792 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9
14793 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
14794 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13
14795 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3,4,5,6],xmm13[7]
14796 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7]
14797 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
14798 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3
14799 ; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm13, %xmm29
14800 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm29[2,1,2,3]
14801 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
14802 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
14803 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
14804 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
14805 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14806 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm11[6,7]
14807 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14808 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
14809 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11
14810 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6],xmm11[7]
14811 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9
14812 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15
14813 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7]
14814 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
14815 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1,2,3,4,5,6],ymm13[7,8],ymm11[9,10,11,12,13,14],ymm13[15]
14816 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0
14817 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3,4,5,6],xmm0[7]
14818 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
14819 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7]
14820 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11
14821 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
14822 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
14823 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
14824 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
14825 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
14826 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
14827 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
14828 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14829 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
14830 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16
14831 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23>
14832 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9
14833 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14834 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4,5,6,7,8],ymm0[9],ymm9[10,11,12,13,14,15]
14835 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1
14836 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm4[2],ymm1[3,4,5],ymm4[6],ymm1[7]
14837 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13
14838 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7]
14839 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u>
14840 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm9
14841 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
14842 ; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm9
14843 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
14844 ; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm29
14845 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
14846 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm29[0,1,2,1]
14847 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7]
14848 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
14849 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14850 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
14851 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
14852 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
14853 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload
14854 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00
14855 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
14856 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1}
14857 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14858 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm9
14859 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7]
14860 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm15
14861 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11
14862 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14863 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
14864 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
14865 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14
14866 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7]
14867 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11
14868 ; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm11
14869 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
14870 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm25
14871 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13
14872 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
14873 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
14874 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7]
14875 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
14876 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14877 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
14878 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
14879 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm8 # 64-byte Folded Reload
14880 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1}
14881 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14882 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0
14883 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm8
14884 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7]
14885 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14886 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
14887 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
14888 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
14889 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14890 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,1,2,0,4,5,6,4]
14891 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
14892 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
14893 ; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
14894 ; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
14895 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
14896 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm24
14897 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7]
14898 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17
14899 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14900 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
14901 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm1
14902 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7]
14903 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13
14904 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7]
14905 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
14906 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8
14907 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7]
14908 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
14909 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
14910 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13
14911 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0
14912 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
14913 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
14914 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14915 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
14916 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
14917 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm28, %zmm24
14918 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1}
14919 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0
14920 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2
14921 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7]
14922 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14923 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
14924 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
14925 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
14926 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14927 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[0,1,2,0,4,5,6,4]
14928 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
14929 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
14930 ; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
14931 ; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
14932 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
14933 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm30
14934 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
14935 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23
14936 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22
14937 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11
14938 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
14939 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7]
14940 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20
14941 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0]
14942 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm11, %ymm11
14943 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6
14944 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7]
14945 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21
14946 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14
14947 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7]
14948 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
14949 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0
14950 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7]
14951 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4]
14952 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
14953 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15]
14954 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
14955 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm30 # 64-byte Folded Reload
14956 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm30 {%k1}
14957 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14958 ; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
14959 ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
14960 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31
14961 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27>
14962 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13
14963 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14964 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7,8,9,10,11,12,13],ymm0[14],ymm13[15]
14965 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14966 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
14967 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7]
14968 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26
14969 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14
14970 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
14971 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
14972 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
14973 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
14974 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3,4,5,6,7]
14975 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7]
14976 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19
14977 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27
14978 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13
14979 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7]
14980 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7
14981 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
14982 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14983 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1]
14984 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14985 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15]
14986 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5
14987 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
14988 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25
14989 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9
14990 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm9[1],xmm14[2,3,4,5],xmm9[6],xmm14[7]
14991 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
14992 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
14993 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14994 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
14995 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm13
14996 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15]
14997 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
14998 ; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm9, %ymm9
14999 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
15000 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15001 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
15002 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm2 # 64-byte Folded Reload
15003 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
15004 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15005 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
15006 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
15007 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0
15008 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4,5],ymm14[6],ymm0[7]
15009 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9
15010 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
15011 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15]
15012 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
15013 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
15014 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2
15015 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3
15016 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
15017 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11
15018 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
15019 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
15020 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7]
15021 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
15022 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7]
15023 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
15024 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm13
15025 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4
15026 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
15027 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9
15028 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7]
15029 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
15030 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm12
15031 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7]
15032 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15033 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,0,1]
15034 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15035 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5,6,7,8,9,10],ymm11[11],ymm9[12,13,14,15]
15036 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1
15037 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3
15038 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2
15039 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
15040 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11
15041 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7]
15042 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15043 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
15044 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
15045 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm9, %ymm1
15046 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15047 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1}
15048 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15049 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
15050 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
15051 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
15052 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8
15053 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1
15054 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7]
15055 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9
15056 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7]
15057 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7
15058 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7]
15059 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22
15060 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11
15061 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7]
15062 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
15063 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
15064 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
15065 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
15066 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29
15067 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15068 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
15069 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
15070 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0
15071 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15072 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5
15073 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
15074 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7]
15075 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm21
15076 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1]
15077 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15]
15078 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7
15079 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7]
15080 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19
15081 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm20
15082 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13
15083 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
15084 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
15085 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
15086 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
15087 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
15088 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
15089 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1
15090 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm1[2,3,4,5,6,7]
15091 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload
15092 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm13 # 64-byte Folded Reload
15093 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1}
15094 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7]
15095 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
15096 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
15097 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0
15098 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
15099 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm15
15100 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26
15101 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9
15102 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7]
15103 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4
15104 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
15105 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
15106 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29
15107 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31
15108 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11
15109 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7]
15110 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15111 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
15112 ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
15113 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0
15114 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15115 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7
15116 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7]
15117 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm12
15118 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1]
15119 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15]
15120 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1
15121 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14
15122 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6
15123 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7]
15124 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9
15125 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
15126 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7]
15127 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
15128 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
15129 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
15130 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
15131 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15132 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm9
15133 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1}
15134 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0
15135 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6,7]
15136 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
15137 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
15138 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1
15139 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2
15140 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
15141 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
15142 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
15143 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
15144 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
15145 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
15146 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
15147 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
15148 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
15149 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0
15150 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7]
15151 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
15152 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
15153 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0
15154 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3
15155 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
15156 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4
15157 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
15158 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
15159 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
15160 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
15161 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15162 ; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15163 ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
15164 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,3,1]
15165 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
15166 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3
15167 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7]
15168 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
15169 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
15170 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15171 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
15172 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
15173 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm11 # 64-byte Folded Reload
15174 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1}
15175 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm7[1],ymm12[2,3],ymm7[4],ymm12[5,6,7]
15176 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
15177 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15]
15178 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15179 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
15180 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
15181 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7]
15182 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7]
15183 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5
15184 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
15185 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
15186 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
15187 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
15188 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
15189 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm6
15190 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1
15191 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
15192 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
15193 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7]
15194 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
15195 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4
15196 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5
15197 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
15198 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
15199 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
15200 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
15201 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
15202 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
15203 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
15204 ; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
15205 ; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
15206 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1]
15207 ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0
15208 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7]
15209 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15210 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
15211 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15212 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
15213 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
15214 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15215 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
15216 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
15217 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
15218 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
15219 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15220 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
15221 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload
15222 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2
15223 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3
15224 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%rsi)
15225 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi)
15226 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
15227 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx)
15228 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15229 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx)
15230 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15231 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx)
15232 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%r8)
15233 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8)
15234 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15235 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9)
15236 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15237 ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9)
15238 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15239 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, (%rax)
15240 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax)
15241 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15242 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax)
15243 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax)
15244 ; AVX512DQ-SLOW-NEXT: addq $1560, %rsp # imm = 0x618
15245 ; AVX512DQ-SLOW-NEXT: vzeroupper
15246 ; AVX512DQ-SLOW-NEXT: retq
15247 ;
15248 ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf64:
15249 ; AVX512DQ-FAST: # %bb.0:
15250 ; AVX512DQ-FAST-NEXT: subq $1288, %rsp # imm = 0x508
15251 ; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm3
15252 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30
15253 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,5,9,u,12,u,u,u>
15254 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm18, %zmm0
15255 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15]
15256 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm1
15257 ; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm2, %zmm4
15258 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17
15259 ; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm15
15260 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm6
15261 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7]
15262 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22
15263 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5
15264 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
15265 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u>
15266 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm6
15267 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
15268 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
15269 ; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm29
15270 ; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm7
15271 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
15272 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6
15273 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20
15274 ; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7
15275 ; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
15276 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19
15277 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,0,2]
15278 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15279 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1
15280 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6
15281 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
15282 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
15283 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8
15284 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
15285 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
15286 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2
15287 ; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm28
15288 ; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3
15289 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1
15290 ; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm2
15291 ; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
15292 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15293 ; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm13
15294 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15295 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0
15296 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
15297 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm14
15298 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16
15299 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
15300 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
15301 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
15302 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1
15303 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
15304 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15305 ; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm8
15306 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1
15307 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7]
15308 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm11
15309 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
15310 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
15311 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
15312 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5
15313 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
15314 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
15315 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9
15316 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm9[7]
15317 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm3[1],xmm13[2,3,4,5,6,7]
15318 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm31
15319 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm1
15320 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15321 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
15322 ; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10
15323 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
15324 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15325 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0
15326 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7]
15327 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
15328 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
15329 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7
15330 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm18, %zmm3
15331 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2
15332 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm7, %ymm0
15333 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15334 ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0
15335 ; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3
15336 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7]
15337 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23
15338 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25
15339 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
15340 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
15341 ; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm21
15342 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
15343 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm21[0,1,0,2]
15344 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15345 ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm4
15346 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26
15347 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7]
15348 ; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm12
15349 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm3
15350 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, (%rsp) # 16-byte Spill
15351 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm3[1],xmm12[2,3,4,5,6,7]
15352 ; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5
15353 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u>
15354 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0
15355 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15356 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0
15357 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15358 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6,7]
15359 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm7
15360 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15361 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
15362 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
15363 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15364 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm6
15365 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15366 ; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6
15367 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
15368 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5
15369 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5
15370 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15371 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
15372 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20
15373 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
15374 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
15375 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
15376 ; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5
15377 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm16
15378 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
15379 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5]
15380 ; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm10, %ymm13
15381 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15382 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm13[7]
15383 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm5
15384 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
15385 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15386 ; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm14
15387 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27
15388 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1
15389 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15390 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15391 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7]
15392 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
15393 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7]
15394 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9
15395 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm2, %zmm2
15396 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15397 ; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
15398 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2
15399 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15400 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm13
15401 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm11
15402 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
15403 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
15404 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
15405 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4
15406 ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2
15407 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15408 ; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm10, %ymm4
15409 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15410 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
15411 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
15412 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm22
15413 ; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6
15414 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
15415 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15416 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7]
15417 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6
15418 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4,5,6,7]
15419 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15]
15420 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm9
15421 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15422 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9
15423 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
15424 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2
15425 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm0
15426 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15427 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm3
15428 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8
15429 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7]
15430 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9
15431 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7]
15432 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
15433 ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2
15434 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm14
15435 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
15436 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,1,1,3]
15437 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm12
15438 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
15439 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
15440 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5
15441 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm20
15442 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7]
15443 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm12
15444 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2],xmm5[3],xmm12[4,5,6,7]
15445 ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5
15446 ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm12
15447 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6
15448 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm0
15449 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15450 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7]
15451 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
15452 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7]
15453 ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5
15454 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6
15455 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,1,1,3]
15456 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9
15457 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7]
15458 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4
15459 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm23
15460 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
15461 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
15462 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
15463 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
15464 ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
15465 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4
15466 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
15467 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
15468 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
15469 ; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4
15470 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm8
15471 ; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm31, %xmm6
15472 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
15473 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm25
15474 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2
15475 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
15476 ; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm18
15477 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [3,6,10,13,3,6,10,13]
15478 ; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1]
15479 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm26, %zmm3
15480 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u>
15481 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3
15482 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
15483 ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0
15484 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
15485 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm2, %zmm29 {%k1}
15486 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15487 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7]
15488 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
15489 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
15490 ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1
15491 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15492 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm2
15493 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
15494 ; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm2
15495 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4
15496 ; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm3
15497 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
15498 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm31
15499 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,3,3,0,3,7,7]
15500 ; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm13, %ymm1
15501 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
15502 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2
15503 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3
15504 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12]
15505 ; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1]
15506 ; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm7
15507 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
15508 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7
15509 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15]
15510 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
15511 ; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm7
15512 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
15513 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2
15514 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15515 ; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm7
15516 ; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm9
15517 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
15518 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm16
15519 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17
15520 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9
15521 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3],xmm9[4],xmm2[5],xmm9[6,7]
15522 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
15523 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2
15524 ; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30
15525 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,4,7,11,14>
15526 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm12
15527 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
15528 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm12
15529 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5,6],xmm12[7]
15530 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7]
15531 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
15532 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0
15533 ; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm26, %zmm12
15534 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6
15535 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
15536 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 {%k1} # 16-byte Folded Reload
15537 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15538 ; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm13, %ymm0
15539 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0
15540 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm5, %zmm6
15541 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1
15542 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
15543 ; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
15544 ; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm7, %xmm1
15545 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
15546 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
15547 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15548 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
15549 ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1]
15550 ; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm1, %ymm0
15551 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
15552 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,9,13,2,6,9,13]
15553 ; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1]
15554 ; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm4, %zmm3
15555 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
15556 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
15557 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
15558 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15559 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0
15560 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6
15561 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0
15562 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15563 ; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6
15564 ; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm5, %zmm3
15565 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
15566 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3
15567 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7]
15568 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15569 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0
15570 ; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm2
15571 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
15572 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3
15573 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm2
15574 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15
15575 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3],xmm15[4],xmm8[5],xmm15[6,7]
15576 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8
15577 ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28
15578 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm10
15579 ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9
15580 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6],xmm9[7]
15581 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
15582 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm9
15583 ; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5
15584 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5
15585 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm5[6,7]
15586 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15587 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
15588 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm13
15589 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11
15590 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8
15591 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6],xmm8[7]
15592 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
15593 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5
15594 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,u,u,u,4,8,11,15>
15595 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm14, %zmm10
15596 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
15597 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10
15598 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7]
15599 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7]
15600 ; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm4, %zmm10
15601 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
15602 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10
15603 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm10[6,7]
15604 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15605 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2
15606 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3
15607 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
15608 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10
15609 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6],xmm10[7]
15610 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5
15611 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm14, %zmm8
15612 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8
15613 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3,4,5,6],xmm8[7]
15614 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
15615 ; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm4, %zmm8
15616 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0
15617 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
15618 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15619 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7]
15620 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm16
15621 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15
15622 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5
15623 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
15624 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u>
15625 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0
15626 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,u,u,u,5,8,12,15>
15627 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm5
15628 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
15629 ; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5
15630 ; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0
15631 ; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm26, %zmm5
15632 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5
15633 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
15634 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
15635 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm27 # 64-byte Folded Reload
15636 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00
15637 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
15638 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
15639 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15640 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7]
15641 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm11
15642 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3
15643 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9
15644 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
15645 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0
15646 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm8
15647 ; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8
15648 ; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0
15649 ; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm26, %zmm6
15650 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6
15651 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
15652 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15653 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload
15654 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
15655 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15656 ; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm1, %ymm0
15657 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
15658 ; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm4, %zmm1
15659 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
15660 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
15661 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm1
15662 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
15663 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2
15664 ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
15665 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
15666 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15667 ; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1
15668 ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2
15669 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
15670 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm13
15671 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm14
15672 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6
15673 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
15674 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm4
15675 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
15676 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
15677 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
15678 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
15679 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0
15680 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15681 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9]
15682 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm9, %zmm10
15683 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
15684 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10
15685 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
15686 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
15687 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6
15688 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
15689 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload
15690 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm20 {%k1}
15691 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
15692 ; AVX512DQ-FAST-NEXT: vmovdqa 864(%rdi), %ymm1
15693 ; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm2
15694 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
15695 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21
15696 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26
15697 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10
15698 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6],xmm0[7]
15699 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0
15700 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm9, %zmm8
15701 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8
15702 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7]
15703 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm22
15704 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
15705 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
15706 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15707 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
15708 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
15709 ; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8
15710 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
15711 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload
15712 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1}
15713 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15714 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
15715 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
15716 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
15717 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
15718 ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm8
15719 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, %xmm3
15720 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
15721 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
15722 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
15723 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <0,3,7,10,14,u,u,u>
15724 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
15725 ; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm12
15726 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
15727 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
15728 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm2
15729 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7]
15730 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm25
15731 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
15732 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm18
15733 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm20
15734 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12
15735 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
15736 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7]
15737 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17
15738 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13
15739 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
15740 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
15741 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
15742 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0
15743 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15744 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u>
15745 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10
15746 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
15747 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10
15748 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
15749 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
15750 ; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10
15751 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
15752 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1}
15753 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15754 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
15755 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
15756 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
15757 ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm10
15758 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16
15759 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
15760 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
15761 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
15762 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
15763 ; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm27, %zmm10
15764 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10
15765 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7]
15766 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm31
15767 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm4
15768 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2
15769 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
15770 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10
15771 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7]
15772 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0
15773 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm15, %zmm10
15774 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1
15775 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3
15776 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7]
15777 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12
15778 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7]
15779 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15780 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
15781 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
15782 ; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1
15783 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15784 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1}
15785 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
15786 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm26
15787 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21
15788 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
15789 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5
15790 ; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
15791 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm27
15792 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15793 ; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
15794 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm25
15795 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
15796 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <0,4,7,11,14,u,u,u>
15797 ; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm12
15798 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29>
15799 ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12
15800 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
15801 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7]
15802 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14
15803 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5
15804 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7]
15805 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15
15806 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5],xmm15[6],xmm12[7]
15807 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm7
15808 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7]
15809 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8
15810 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7]
15811 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
15812 ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12
15813 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
15814 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,u,u,u,6,10,13,u>
15815 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1
15816 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
15817 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1
15818 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
15819 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
15820 ; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1
15821 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
15822 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
15823 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
15824 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15825 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
15826 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm20
15827 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm6
15828 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8
15829 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0
15830 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm8
15831 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0
15832 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1
15833 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
15834 ; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm22, %zmm8
15835 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0
15836 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm8
15837 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm8[2,3,4,5,6,7]
15838 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7]
15839 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17
15840 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18
15841 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8
15842 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5],xmm8[6],xmm1[7]
15843 ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1
15844 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm8
15845 ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2
15846 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15
15847 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7]
15848 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10
15849 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7]
15850 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15851 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
15852 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
15853 ; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm8, %ymm2
15854 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
15855 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm9 # 64-byte Folded Reload
15856 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1}
15857 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0
15858 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1
15859 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
15860 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
15861 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
15862 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2
15863 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
15864 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
15865 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u>
15866 ; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm10, %zmm2
15867 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31>
15868 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
15869 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7]
15870 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7]
15871 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
15872 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
15873 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7]
15874 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
15875 ; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm4
15876 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2
15877 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
15878 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
15879 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3]
15880 ; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm11
15881 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
15882 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11
15883 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3,4,5,6,7]
15884 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
15885 ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1
15886 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15887 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15]
15888 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
15889 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload
15890 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1}
15891 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1
15892 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7]
15893 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
15894 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4
15895 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
15896 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
15897 ; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm4
15898 ; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4
15899 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
15900 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
15901 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload
15902 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7]
15903 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0
15904 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm4
15905 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
15906 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6
15907 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7]
15908 ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4
15909 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7]
15910 ; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3
15911 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6
15912 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
15913 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
15914 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
15915 ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7
15916 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2
15917 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7]
15918 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3
15919 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
15920 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15921 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
15922 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
15923 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
15924 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15925 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
15926 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
15927 ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
15928 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
15929 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload
15930 ; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
15931 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm14
15932 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm6
15933 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rsi)
15934 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rsi)
15935 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rdx)
15936 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rdx)
15937 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15938 ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx)
15939 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15940 ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx)
15941 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15942 ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8)
15943 ; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
15944 ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8)
15945 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%r9)
15946 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15947 ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9)
15948 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
15949 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax)
15950 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15951 ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax)
15952 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
15953 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax)
15954 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax)
15955 ; AVX512DQ-FAST-NEXT: addq $1288, %rsp # imm = 0x508
15956 ; AVX512DQ-FAST-NEXT: vzeroupper
15957 ; AVX512DQ-FAST-NEXT: retq
15958 ;
15959 ; AVX512BW-LABEL: load_i16_stride7_vf64:
15960 ; AVX512BW: # %bb.0:
15961 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15962 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
15963 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0
15964 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2
15965 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5
15966 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3
15967 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6
15968 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7
15969 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4
15970 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13
15971 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15
15972 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10
15973 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11
15974 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
15975 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14
15976 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12
15977 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
15978 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
15979 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17
15980 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17
15981 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u>
15982 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9
15983 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
15984 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
15985 ; AVX512BW-NEXT: kmovd %edi, %k2
15986 ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2}
15987 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
15988 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
15989 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18
15990 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm18
15991 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
15992 ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
15993 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm18
15994 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
15995 ; AVX512BW-NEXT: kmovd %edi, %k1
15996 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1}
15997 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm17
15998 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm17
15999 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm16
16000 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm8
16001 ; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2}
16002 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
16003 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
16004 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
16005 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19
16006 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm19
16007 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
16008 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
16009 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19
16010 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u>
16011 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22
16012 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22
16013 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
16014 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
16015 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16
16016 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm17, %zmm16
16017 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
16018 ; AVX512BW-NEXT: kmovd %edi, %k2
16019 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
16020 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1}
16021 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18
16022 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm18
16023 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm17
16024 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21
16025 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
16026 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1}
16027 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
16028 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
16029 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21
16030 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21
16031 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
16032 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
16033 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm21
16034 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
16035 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
16036 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24
16037 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24
16038 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u>
16039 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19
16040 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
16041 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
16042 ; AVX512BW-NEXT: kmovd %edi, %k1
16043 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm19 {%k1}
16044 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00
16045 ; AVX512BW-NEXT: kmovd %edi, %k2
16046 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2}
16047 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm20
16048 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm22, %zmm20
16049 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm23
16050 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm18
16051 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm18 {%k1}
16052 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2}
16053 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
16054 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
16055 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23
16056 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm22, %zmm23
16057 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
16058 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
16059 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm23
16060 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
16061 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
16062 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26
16063 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26
16064 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u>
16065 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21
16066 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21
16067 ; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1}
16068 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2}
16069 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm22
16070 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm24, %zmm22
16071 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm25
16072 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm20
16073 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
16074 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2}
16075 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
16076 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
16077 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25
16078 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
16079 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
16080 ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
16081 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
16082 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
16083 ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
16084 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28
16085 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28
16086 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u>
16087 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23
16088 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23
16089 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1}
16090 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2}
16091 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
16092 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
16093 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm27
16094 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm22
16095 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm22 {%k1}
16096 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2}
16097 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
16098 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
16099 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25
16100 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
16101 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
16102 ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
16103 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
16104 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
16105 ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
16106 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28
16107 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28
16108 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u>
16109 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30
16110 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30
16111 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1}
16112 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2}
16113 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
16114 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
16115 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm27
16116 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm29
16117 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1}
16118 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2}
16119 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
16120 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
16121 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm10
16122 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
16123 ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
16124 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm10
16125 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
16126 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
16127 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12
16128 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u>
16129 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1
16130 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1}
16131 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
16132 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm4
16133 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4
16134 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
16135 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm0
16136 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
16137 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2}
16138 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi)
16139 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi)
16140 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
16141 ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx)
16142 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx)
16143 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx)
16144 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8)
16145 ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8)
16146 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9)
16147 ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9)
16148 ; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r10)
16149 ; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r10)
16150 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
16151 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
16152 ; AVX512BW-NEXT: vzeroupper
16153 ; AVX512BW-NEXT: retq
16154 %wide.vec = load <448 x i16>, ptr %in.vec, align 64
16155 %strided.vec0 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
16156 %strided.vec1 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
16157 %strided.vec2 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
16158 %strided.vec3 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
16159 %strided.vec4 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
16160 %strided.vec5 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
16161 %strided.vec6 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
16162 store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
16163 store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
16164 store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
16165 store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
16166 store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
16167 store <64 x i16> %strided.vec5, ptr %out.vec5, align 64
16168 store <64 x i16> %strided.vec6, ptr %out.vec6, align 64
16169 ret void
16170 }
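; Illustrative note (not part of the autogenerated checks above): the shufflevector
; masks in this function pick elements j, j+7, j+14, ... out of the single
; <448 x i16> load, i.e. a stride-7 deinterleave into seven <64 x i16> outputs.
; A minimal scalar C sketch of the same transform, with hypothetical names chosen
; only for illustration:
;
;   /* in:  448 contiguous i16 values (64 groups of 7 interleaved fields)
;      out: seven arrays of 64 i16 values, one per field            */
;   static void deinterleave7(const short in[448], short out[7][64]) {
;     for (int i = 0; i < 64; ++i)
;       for (int j = 0; j < 7; ++j)
;         out[j][i] = in[7 * i + j];  /* matches %strided.vecj above */
;   }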
16171 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
16175 ; AVX2-ONLY: {{.*}}
16177 ; AVX512-FAST: {{.*}}
16178 ; AVX512BW-ONLY-FAST: {{.*}}
16179 ; AVX512BW-ONLY-SLOW: {{.*}}
16180 ; AVX512BW-SLOW: {{.*}}
16181 ; AVX512DQBW-FAST: {{.*}}
16182 ; AVX512DQBW-SLOW: {{.*}}
16184 ; FALLBACK0: {{.*}}
16185 ; FALLBACK1: {{.*}}
16186 ; FALLBACK10: {{.*}}
16187 ; FALLBACK11: {{.*}}
16188 ; FALLBACK12: {{.*}}
16189 ; FALLBACK2: {{.*}}
16190 ; FALLBACK3: {{.*}}
16191 ; FALLBACK4: {{.*}}
16192 ; FALLBACK5: {{.*}}
16193 ; FALLBACK6: {{.*}}
16194 ; FALLBACK7: {{.*}}
16195 ; FALLBACK8: {{.*}}
16196 ; FALLBACK9: {{.*}}