1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
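; Note (editorial): the RUN lines above compile the same IR for increasingly capable x86
; feature sets and verify the assembly with FileCheck. Configurations that happen to emit
; identical code share CHECK prefixes (SSE, AVX, AVX2, AVX512, ...); the FALLBACK prefixes
; appear to be per-RUN placeholders so the check-updating script always has a unique prefix
; to fall back on when the shared ones diverge.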
16 ; These patterns are produced by the LoopVectorizer for interleaved loads.
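; As a rough illustration (not part of the generated checks), each @load_i16_stride5_vfN
; below corresponds to de-interleaving a scalar loop of roughly this shape, using
; hypothetical arrays src and out0..out4:
;   for (int i = 0; i < N; ++i)
;     for (int j = 0; j < 5; ++j)
;       out[j][i] = src[5 * i + j];
; i.e. one wide contiguous load of 5*N i16 elements followed by five shufflevectors that
; pick out elements j, j+5, j+10, ... and five narrow stores, as in the IR bodies below.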
18 define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
19 ; SSE-LABEL: load_i16_stride5_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
23 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
24 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
26 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
27 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
28 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
29 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
30 ; SSE-NEXT: psrlq $48, %xmm0
31 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
32 ; SSE-NEXT: psrld $16, %xmm1
33 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
34 ; SSE-NEXT: movd %xmm2, (%rsi)
35 ; SSE-NEXT: movd %xmm3, (%rdx)
36 ; SSE-NEXT: movd %xmm4, (%rcx)
37 ; SSE-NEXT: movd %xmm0, (%r8)
38 ; SSE-NEXT: movd %xmm5, (%r9)
39 ; SSE-NEXT: retq
40 ;
41 ; AVX1-ONLY-LABEL: load_i16_stride5_vf2:
42 ; AVX1-ONLY: # %bb.0:
43 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
44 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
45 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
46 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
47 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
48 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
49 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
50 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
51 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5
52 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
53 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
54 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
55 ; AVX1-ONLY-NEXT: vmovd %xmm2, (%rsi)
56 ; AVX1-ONLY-NEXT: vmovd %xmm3, (%rdx)
57 ; AVX1-ONLY-NEXT: vmovd %xmm4, (%rcx)
58 ; AVX1-ONLY-NEXT: vmovd %xmm5, (%r8)
59 ; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9)
60 ; AVX1-ONLY-NEXT: retq
62 ; AVX2-SLOW-LABEL: load_i16_stride5_vf2:
63 ; AVX2-SLOW: # %bb.0:
64 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
65 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
66 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
67 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
68 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
69 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
70 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
71 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
72 ; AVX2-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
73 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
74 ; AVX2-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm5
75 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
76 ; AVX2-SLOW-NEXT: vmovd %xmm2, (%rsi)
77 ; AVX2-SLOW-NEXT: vmovd %xmm3, (%rdx)
78 ; AVX2-SLOW-NEXT: vmovd %xmm4, (%rcx)
79 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%r8)
80 ; AVX2-SLOW-NEXT: vmovd %xmm1, (%r9)
81 ; AVX2-SLOW-NEXT: retq
83 ; AVX2-FAST-LABEL: load_i16_stride5_vf2:
84 ; AVX2-FAST: # %bb.0:
85 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
86 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
87 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
88 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
89 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
90 ; AVX2-FAST-NEXT: vpsrlq $48, %xmm0, %xmm0
91 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
92 ; AVX2-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm5
93 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
94 ; AVX2-FAST-NEXT: vmovd %xmm2, (%rsi)
95 ; AVX2-FAST-NEXT: vmovd %xmm3, (%rdx)
96 ; AVX2-FAST-NEXT: vmovd %xmm4, (%rcx)
97 ; AVX2-FAST-NEXT: vmovd %xmm0, (%r8)
98 ; AVX2-FAST-NEXT: vmovd %xmm1, (%r9)
99 ; AVX2-FAST-NEXT: retq
101 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf2:
102 ; AVX2-FAST-PERLANE: # %bb.0:
103 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
104 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1
105 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
106 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
107 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
108 ; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm0, %xmm0
109 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
110 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 8(%rdi), %xmm5
111 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
112 ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm2, (%rsi)
113 ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm3, (%rdx)
114 ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm4, (%rcx)
115 ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm0, (%r8)
116 ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm1, (%r9)
117 ; AVX2-FAST-PERLANE-NEXT: retq
119 ; AVX512-SLOW-LABEL: load_i16_stride5_vf2:
120 ; AVX512-SLOW: # %bb.0:
121 ; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
122 ; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
123 ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
124 ; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
125 ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
126 ; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
127 ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
128 ; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
129 ; AVX512-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
130 ; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
131 ; AVX512-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm5
132 ; AVX512-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
133 ; AVX512-SLOW-NEXT: vmovd %xmm2, (%rsi)
134 ; AVX512-SLOW-NEXT: vmovd %xmm3, (%rdx)
135 ; AVX512-SLOW-NEXT: vmovd %xmm4, (%rcx)
136 ; AVX512-SLOW-NEXT: vmovd %xmm0, (%r8)
137 ; AVX512-SLOW-NEXT: vmovd %xmm1, (%r9)
138 ; AVX512-SLOW-NEXT: retq
140 ; AVX512-FAST-LABEL: load_i16_stride5_vf2:
141 ; AVX512-FAST: # %bb.0:
142 ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
143 ; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
144 ; AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
145 ; AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
146 ; AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
147 ; AVX512-FAST-NEXT: vpsrlq $48, %xmm0, %xmm0
148 ; AVX512-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
149 ; AVX512-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm5
150 ; AVX512-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
151 ; AVX512-FAST-NEXT: vmovd %xmm2, (%rsi)
152 ; AVX512-FAST-NEXT: vmovd %xmm3, (%rdx)
153 ; AVX512-FAST-NEXT: vmovd %xmm4, (%rcx)
154 ; AVX512-FAST-NEXT: vmovd %xmm0, (%r8)
155 ; AVX512-FAST-NEXT: vmovd %xmm1, (%r9)
156 ; AVX512-FAST-NEXT: retq
157 %wide.vec = load <10 x i16>, ptr %in.vec, align 64
158 %strided.vec0 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 0, i32 5>
159 %strided.vec1 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 1, i32 6>
160 %strided.vec2 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 2, i32 7>
161 %strided.vec3 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 3, i32 8>
162 %strided.vec4 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 4, i32 9>
163 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
164 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
165 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
166 store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
167 store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
168 ret void
169 }
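; For vf4 the wide load is <20 x i16>. Note how the AVX512BW RUN lines below can use
; vpermw/vpermi2w word permutes with broadcast index vectors to gather elements
; {k, k+5, k+10, k+15} directly, while the SSE/AVX configurations use chains of
; pshufd/pshuflw/pshufb shuffles and blends, and the AVX512F variants mix in scalar
; pextrw/pinsrw inserts.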
171 define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
172 ; SSE-LABEL: load_i16_stride5_vf4:
173 ; SSE: # %bb.0:
174 ; SSE-NEXT: movdqa (%rdi), %xmm2
175 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
176 ; SSE-NEXT: movdqa 32(%rdi), %xmm0
177 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
178 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,2,3,4,5,6,7]
179 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
180 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
181 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
182 ; SSE-NEXT: movdqa %xmm3, %xmm4
183 ; SSE-NEXT: psrlq $48, %xmm4
184 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3]
185 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7]
186 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
187 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
188 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1]
189 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
190 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
191 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
192 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3]
193 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
194 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
195 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
196 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0]
197 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
198 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
199 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7]
200 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1]
201 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
202 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
203 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7]
204 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
205 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
206 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
207 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
208 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
209 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
210 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0]
211 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
212 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
213 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
214 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
215 ; SSE-NEXT: pand %xmm7, %xmm2
216 ; SSE-NEXT: pandn %xmm0, %xmm7
217 ; SSE-NEXT: por %xmm2, %xmm7
218 ; SSE-NEXT: movq %xmm1, (%rsi)
219 ; SSE-NEXT: movq %xmm4, (%rdx)
220 ; SSE-NEXT: movq %xmm5, (%rcx)
221 ; SSE-NEXT: movq %xmm6, (%r8)
222 ; SSE-NEXT: movq %xmm7, (%r9)
223 ; SSE-NEXT: retq
224 ;
225 ; AVX1-ONLY-LABEL: load_i16_stride5_vf4:
226 ; AVX1-ONLY: # %bb.0:
227 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
228 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
229 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
230 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
231 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3
232 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
233 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
234 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
235 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm4
236 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3]
237 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7]
238 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
239 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
240 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u]
241 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3]
242 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
243 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
244 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
245 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u]
246 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
247 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u]
248 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
249 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
250 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
251 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
252 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
253 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7]
254 ; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi)
255 ; AVX1-ONLY-NEXT: vmovq %xmm4, (%rdx)
256 ; AVX1-ONLY-NEXT: vmovq %xmm5, (%rcx)
257 ; AVX1-ONLY-NEXT: vmovq %xmm6, (%r8)
258 ; AVX1-ONLY-NEXT: vmovq %xmm1, (%r9)
259 ; AVX1-ONLY-NEXT: retq
261 ; AVX2-SLOW-LABEL: load_i16_stride5_vf4:
262 ; AVX2-SLOW: # %bb.0:
263 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
264 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
265 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
266 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3]
267 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
268 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
269 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
270 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
271 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
272 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
273 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
274 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
275 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
276 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
277 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
278 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
279 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
280 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
281 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
282 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
283 ; AVX2-SLOW-NEXT: vmovq %xmm3, (%rsi)
284 ; AVX2-SLOW-NEXT: vmovq %xmm4, (%rdx)
285 ; AVX2-SLOW-NEXT: vmovq %xmm5, (%rcx)
286 ; AVX2-SLOW-NEXT: vmovq %xmm6, (%r8)
287 ; AVX2-SLOW-NEXT: vmovq %xmm0, (%r9)
288 ; AVX2-SLOW-NEXT: retq
290 ; AVX2-FAST-LABEL: load_i16_stride5_vf4:
291 ; AVX2-FAST: # %bb.0:
292 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
293 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
294 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
295 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
296 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
297 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
298 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
299 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
300 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
301 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
302 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
303 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
304 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
305 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
306 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
307 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
308 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
309 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
310 ; AVX2-FAST-NEXT: vmovq %xmm3, (%rsi)
311 ; AVX2-FAST-NEXT: vmovq %xmm4, (%rdx)
312 ; AVX2-FAST-NEXT: vmovq %xmm5, (%rcx)
313 ; AVX2-FAST-NEXT: vmovq %xmm6, (%r8)
314 ; AVX2-FAST-NEXT: vmovq %xmm0, (%r9)
315 ; AVX2-FAST-NEXT: retq
317 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf4:
318 ; AVX2-FAST-PERLANE: # %bb.0:
319 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
320 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1
321 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2
322 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
323 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
324 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
325 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
326 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
327 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
328 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
329 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
330 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
331 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
332 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
333 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
334 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
335 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
336 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
337 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%rsi)
338 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%rdx)
339 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rcx)
340 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%r8)
341 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%r9)
342 ; AVX2-FAST-PERLANE-NEXT: retq
344 ; AVX512F-SLOW-LABEL: load_i16_stride5_vf4:
345 ; AVX512F-SLOW: # %bb.0:
346 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
347 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
348 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
349 ; AVX512F-SLOW-NEXT: vpextrw $5, %xmm0, %eax
350 ; AVX512F-SLOW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3
351 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
352 ; AVX512F-SLOW-NEXT: vpextrw $7, %xmm1, %eax
353 ; AVX512F-SLOW-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
354 ; AVX512F-SLOW-NEXT: vpextrw $6, %xmm0, %eax
355 ; AVX512F-SLOW-NEXT: vpextrw $1, %xmm0, %r10d
356 ; AVX512F-SLOW-NEXT: vmovd %r10d, %xmm4
357 ; AVX512F-SLOW-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
358 ; AVX512F-SLOW-NEXT: vpextrw $3, %xmm1, %eax
359 ; AVX512F-SLOW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1
360 ; AVX512F-SLOW-NEXT: vmovd %xmm2, %eax
361 ; AVX512F-SLOW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
362 ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4
363 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
364 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
365 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
366 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
367 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
368 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
369 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
370 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
371 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
372 ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi)
373 ; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx)
374 ; AVX512F-SLOW-NEXT: vmovq %xmm5, (%rcx)
375 ; AVX512F-SLOW-NEXT: vmovq %xmm6, (%r8)
376 ; AVX512F-SLOW-NEXT: vmovq %xmm0, (%r9)
377 ; AVX512F-SLOW-NEXT: retq
379 ; AVX512F-FAST-LABEL: load_i16_stride5_vf4:
380 ; AVX512F-FAST: # %bb.0:
381 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
382 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
383 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
384 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
385 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
386 ; AVX512F-FAST-NEXT: vpextrw $7, %xmm1, %eax
387 ; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
388 ; AVX512F-FAST-NEXT: vpextrw $3, %xmm1, %eax
389 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
390 ; AVX512F-FAST-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
391 ; AVX512F-FAST-NEXT: vmovd %xmm2, %eax
392 ; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
393 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm4
394 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
395 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
396 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
397 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
398 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
399 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
400 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
401 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
402 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
403 ; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi)
404 ; AVX512F-FAST-NEXT: vmovq %xmm1, (%rdx)
405 ; AVX512F-FAST-NEXT: vmovq %xmm5, (%rcx)
406 ; AVX512F-FAST-NEXT: vmovq %xmm6, (%r8)
407 ; AVX512F-FAST-NEXT: vmovq %xmm0, (%r9)
408 ; AVX512F-FAST-NEXT: retq
410 ; AVX512BW-LABEL: load_i16_stride5_vf4:
411 ; AVX512BW: # %bb.0:
412 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0]
413 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
414 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
415 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0]
416 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1
417 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
418 ; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
419 ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
420 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
421 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17]
422 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3
423 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4
424 ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2
425 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18]
426 ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5
427 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19]
428 ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6
429 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
430 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
431 ; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
432 ; AVX512BW-NEXT: vmovq %xmm5, (%r8)
433 ; AVX512BW-NEXT: vmovq %xmm6, (%r9)
434 ; AVX512BW-NEXT: vzeroupper
435 ; AVX512BW-NEXT: retq
436 %wide.vec = load <20 x i16>, ptr %in.vec, align 64
437 %strided.vec0 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
438 %strided.vec1 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
439 %strided.vec2 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
440 %strided.vec3 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
441 %strided.vec4 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
442 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
443 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
444 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
445 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
446 store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
447 ret void
448 }
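; For vf8 the wide load is <40 x i16>, which no longer fits in a single 512-bit register,
; so the AVX512BW lowering below loads two zmm registers and uses five vpermi2w permutes
; whose index vectors (0,5,...,35 and so on) select across both sources. The AVX/AVX2
; paths instead blend the two 32-byte input registers with vpblendw/vpblendd, extract the
; relevant 128-bit half, and finish with vpshufb, while SSE does the equivalent with
; pand/pandn/por masking and shufps/pshufd/pshuflw shuffles.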
450 define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
451 ; SSE-LABEL: load_i16_stride5_vf8:
452 ; SSE: # %bb.0:
453 ; SSE-NEXT: movdqa 64(%rdi), %xmm6
454 ; SSE-NEXT: movdqa (%rdi), %xmm4
455 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
456 ; SSE-NEXT: movdqa 32(%rdi), %xmm0
457 ; SSE-NEXT: movdqa 48(%rdi), %xmm5
458 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
459 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
460 ; SSE-NEXT: pand %xmm1, %xmm2
461 ; SSE-NEXT: pandn %xmm0, %xmm1
462 ; SSE-NEXT: por %xmm2, %xmm1
463 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
464 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
465 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3]
466 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
467 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
468 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3]
469 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
470 ; SSE-NEXT: andps %xmm1, %xmm7
471 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1]
472 ; SSE-NEXT: movaps %xmm1, %xmm2
473 ; SSE-NEXT: pandn %xmm8, %xmm2
474 ; SSE-NEXT: por %xmm7, %xmm2
475 ; SSE-NEXT: movdqa %xmm3, %xmm7
476 ; SSE-NEXT: psrlq $48, %xmm7
477 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3]
478 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
479 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
480 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535]
481 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3]
482 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
483 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
484 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7]
485 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
486 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7]
487 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,4,7]
488 ; SSE-NEXT: pand %xmm7, %xmm9
489 ; SSE-NEXT: pandn %xmm8, %xmm7
490 ; SSE-NEXT: por %xmm9, %xmm7
491 ; SSE-NEXT: pand %xmm1, %xmm7
492 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,2,0]
493 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
494 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,1,3]
495 ; SSE-NEXT: psllq $48, %xmm6
496 ; SSE-NEXT: pandn %xmm6, %xmm1
497 ; SSE-NEXT: por %xmm7, %xmm1
498 ; SSE-NEXT: movdqa %xmm5, %xmm7
499 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
500 ; SSE-NEXT: movdqa %xmm5, %xmm12
501 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0]
502 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3]
503 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3]
504 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535]
505 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,1,3]
506 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
507 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
508 ; SSE-NEXT: pand %xmm13, %xmm5
509 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1]
510 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3]
511 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1]
512 ; SSE-NEXT: movdqa %xmm13, %xmm15
513 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,3,2,3,4,5,6,7]
514 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
515 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,3,3,4,5,6,7]
516 ; SSE-NEXT: pand %xmm13, %xmm11
517 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0]
518 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
519 ; SSE-NEXT: movdqa %xmm13, %xmm4
520 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
521 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
522 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
523 ; SSE-NEXT: pand %xmm13, %xmm3
524 ; SSE-NEXT: pandn %xmm12, %xmm13
525 ; SSE-NEXT: por %xmm13, %xmm5
526 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7]
527 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
528 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm12[2,3]
529 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
530 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7]
531 ; SSE-NEXT: pandn %xmm8, %xmm15
532 ; SSE-NEXT: por %xmm15, %xmm11
533 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0]
534 ; SSE-NEXT: pandn %xmm0, %xmm4
535 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2]
536 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
537 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,6]
538 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[2,3]
539 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm7[2,0]
540 ; SSE-NEXT: por %xmm4, %xmm3
541 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,3,4,5,6,7]
542 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,4,7]
543 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3]
544 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
545 ; SSE-NEXT: movdqa %xmm2, (%rsi)
546 ; SSE-NEXT: movdqa %xmm1, (%rdx)
547 ; SSE-NEXT: movaps %xmm5, (%rcx)
548 ; SSE-NEXT: movaps %xmm11, (%r8)
549 ; SSE-NEXT: movaps %xmm3, (%r9)
550 ; SSE-NEXT: retq
551 ;
552 ; AVX1-ONLY-LABEL: load_i16_stride5_vf8:
553 ; AVX1-ONLY: # %bb.0:
554 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
555 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
556 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
557 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3
558 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,3]
559 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6,7]
560 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3]
561 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
562 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
563 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
564 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
565 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
566 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
567 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
568 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm6[7]
569 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm6
570 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3]
571 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
572 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
573 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
574 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,0,1,10,11,4,5,14,15,u,u]
575 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4,5,6,7]
576 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm7
577 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
578 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,1,3]
579 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
580 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
581 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
582 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
583 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7]
584 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,0]
585 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
586 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
587 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
588 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15]
589 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
590 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
591 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
592 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5],xmm8[6,7]
593 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,3]
594 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
595 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7]
596 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
597 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
598 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
599 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
600 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
601 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15]
602 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4,5],xmm0[6,7]
603 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,1,3]
604 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
605 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
606 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsi)
607 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rdx)
608 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rcx)
609 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%r8)
610 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9)
611 ; AVX1-ONLY-NEXT: retq
613 ; AVX2-SLOW-LABEL: load_i16_stride5_vf8:
614 ; AVX2-SLOW: # %bb.0:
615 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
616 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
617 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
618 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
619 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
620 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
621 ; AVX2-SLOW-NEXT: vpbroadcastw 70(%rdi), %xmm3
622 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
623 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
624 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
625 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
626 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
627 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4
628 ; AVX2-SLOW-NEXT: vpsllq $48, %xmm4, %xmm5
629 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
630 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15]
631 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
632 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
633 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
634 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0]
635 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5]
636 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
637 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
638 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
639 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
640 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
641 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3]
642 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
643 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
644 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
645 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
646 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
647 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
648 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3]
649 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
650 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
651 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsi)
652 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rdx)
653 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rcx)
654 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%r8)
655 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r9)
656 ; AVX2-SLOW-NEXT: vzeroupper
657 ; AVX2-SLOW-NEXT: retq
659 ; AVX2-FAST-LABEL: load_i16_stride5_vf8:
660 ; AVX2-FAST: # %bb.0:
661 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
662 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
663 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
664 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
665 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
666 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
667 ; AVX2-FAST-NEXT: vpbroadcastw 70(%rdi), %xmm3
668 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
669 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
670 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
671 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
672 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
673 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4
674 ; AVX2-FAST-NEXT: vpsllq $48, %xmm4, %xmm5
675 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
676 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
677 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
678 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
679 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
680 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11]
681 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
682 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
683 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
684 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
685 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
686 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
687 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
688 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
689 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
690 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
691 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
692 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15]
693 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
694 ; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsi)
695 ; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rdx)
696 ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rcx)
697 ; AVX2-FAST-NEXT: vmovdqa %xmm6, (%r8)
698 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r9)
699 ; AVX2-FAST-NEXT: vzeroupper
700 ; AVX2-FAST-NEXT: retq
702 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf8:
703 ; AVX2-FAST-PERLANE: # %bb.0:
704 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
705 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
706 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
707 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
708 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
709 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
710 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 70(%rdi), %xmm3
711 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
712 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
713 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4
714 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
715 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
716 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4
717 ; AVX2-FAST-PERLANE-NEXT: vpsllq $48, %xmm4, %xmm5
718 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
719 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
720 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6
721 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
722 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
723 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11]
724 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
725 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
726 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
727 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
728 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
729 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
730 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
731 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
732 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
733 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
734 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
735 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15]
736 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
737 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsi)
738 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rdx)
739 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rcx)
740 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%r8)
741 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%r9)
742 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
743 ; AVX2-FAST-PERLANE-NEXT: retq
745 ; AVX512F-SLOW-LABEL: load_i16_stride5_vf8:
746 ; AVX512F-SLOW: # %bb.0:
747 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
748 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
749 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
750 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
751 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
752 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
753 ; AVX512F-SLOW-NEXT: vpbroadcastw 70(%rdi), %xmm3
754 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
755 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3
756 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
757 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
758 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
759 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
760 ; AVX512F-SLOW-NEXT: vpsllq $48, %xmm3, %xmm5
761 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
762 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15]
763 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
764 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
765 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
766 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0]
767 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5]
768 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
769 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
770 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
771 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
772 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
773 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
774 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
775 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
776 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
777 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
778 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
779 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
780 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
781 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
782 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
783 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rsi)
784 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, (%rdx)
785 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rcx)
786 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%r8)
787 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%r9)
788 ; AVX512F-SLOW-NEXT: vzeroupper
789 ; AVX512F-SLOW-NEXT: retq
791 ; AVX512F-FAST-LABEL: load_i16_stride5_vf8:
792 ; AVX512F-FAST: # %bb.0:
793 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
794 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
795 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
796 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
797 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
798 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
799 ; AVX512F-FAST-NEXT: vpbroadcastw 70(%rdi), %xmm3
800 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
801 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm3
802 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
803 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
804 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
805 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
806 ; AVX512F-FAST-NEXT: vpsllq $48, %xmm3, %xmm5
807 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
808 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
809 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
810 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
811 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
812 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11]
813 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
814 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
815 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
816 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
817 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
818 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
819 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
820 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
821 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
822 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
823 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
824 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15]
825 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
826 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, (%rsi)
827 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, (%rdx)
828 ; AVX512F-FAST-NEXT: vmovdqa %xmm5, (%rcx)
829 ; AVX512F-FAST-NEXT: vmovdqa %xmm6, (%r8)
830 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%r9)
831 ; AVX512F-FAST-NEXT: vzeroupper
832 ; AVX512F-FAST-NEXT: retq
834 ; AVX512BW-LABEL: load_i16_stride5_vf8:
835 ; AVX512BW: # %bb.0:
836 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
837 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
838 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35]
839 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
840 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36]
841 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
842 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37]
843 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
844 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38]
845 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
846 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39]
847 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
848 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
849 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
850 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
851 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
852 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
853 ; AVX512BW-NEXT: vzeroupper
854 ; AVX512BW-NEXT: retq
855 %wide.vec = load <40 x i16>, ptr %in.vec, align 64
856 %strided.vec0 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35>
857 %strided.vec1 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36>
858 %strided.vec2 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37>
859 %strided.vec3 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38>
860 %strided.vec4 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39>
861 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
862 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
863 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
864 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
865 store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
866 ret void
867 }
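; For vf16 the wide load is <80 x i16> (160 bytes, i.e. ten xmm loads for SSE). The SSE
; lowering below keeps enough values live at once that it has to spill intermediate xmm
; results to the stack, which is where the 16-byte Spill annotations in the checks come from.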
869 define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
870 ; SSE-LABEL: load_i16_stride5_vf16:
871 ; SSE: # %bb.0:
872 ; SSE-NEXT: movdqa 144(%rdi), %xmm14
873 ; SSE-NEXT: movdqa 80(%rdi), %xmm8
874 ; SSE-NEXT: movdqa 96(%rdi), %xmm7
875 ; SSE-NEXT: movdqa 128(%rdi), %xmm15
876 ; SSE-NEXT: movdqa 112(%rdi), %xmm12
877 ; SSE-NEXT: movdqa 64(%rdi), %xmm10
878 ; SSE-NEXT: movdqa (%rdi), %xmm11
879 ; SSE-NEXT: movdqa 16(%rdi), %xmm9
880 ; SSE-NEXT: movdqa 32(%rdi), %xmm13
881 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
882 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
883 ; SSE-NEXT: movdqa %xmm0, %xmm1
884 ; SSE-NEXT: pandn %xmm13, %xmm1
885 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
886 ; SSE-NEXT: movdqa %xmm3, %xmm5
887 ; SSE-NEXT: pand %xmm0, %xmm2
888 ; SSE-NEXT: por %xmm1, %xmm2
889 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3]
890 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
891 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
892 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7]
893 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
894 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
895 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
896 ; SSE-NEXT: andps %xmm6, %xmm4
897 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1]
898 ; SSE-NEXT: movaps %xmm6, %xmm2
899 ; SSE-NEXT: pandn %xmm1, %xmm2
900 ; SSE-NEXT: por %xmm4, %xmm2
901 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
902 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3]
903 ; SSE-NEXT: pand %xmm0, %xmm1
904 ; SSE-NEXT: pandn %xmm12, %xmm0
905 ; SSE-NEXT: por %xmm1, %xmm0
906 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3]
907 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
908 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
909 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
910 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
911 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
912 ; SSE-NEXT: andps %xmm6, %xmm2
913 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,1]
914 ; SSE-NEXT: movaps %xmm6, %xmm1
915 ; SSE-NEXT: andnps %xmm0, %xmm1
916 ; SSE-NEXT: orps %xmm2, %xmm1
917 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
918 ; SSE-NEXT: movdqa %xmm9, %xmm0
919 ; SSE-NEXT: psrlq $48, %xmm0
920 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3]
921 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
922 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
923 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
924 ; SSE-NEXT: movdqa %xmm0, %xmm2
925 ; SSE-NEXT: pandn %xmm1, %xmm2
926 ; SSE-NEXT: movdqa %xmm5, %xmm3
927 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3]
928 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
929 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
930 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7]
931 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
932 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
933 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
934 ; SSE-NEXT: pand %xmm0, %xmm1
935 ; SSE-NEXT: por %xmm2, %xmm1
936 ; SSE-NEXT: movdqa %xmm10, %xmm5
937 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
938 ; SSE-NEXT: movdqa %xmm10, %xmm2
939 ; SSE-NEXT: psllq $48, %xmm2
940 ; SSE-NEXT: movaps %xmm6, %xmm4
941 ; SSE-NEXT: andnps %xmm2, %xmm4
942 ; SSE-NEXT: pand %xmm6, %xmm1
943 ; SSE-NEXT: orps %xmm1, %xmm4
944 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
945 ; SSE-NEXT: movdqa %xmm7, %xmm1
946 ; SSE-NEXT: psrlq $48, %xmm1
947 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3]
948 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
949 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
950 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3]
951 ; SSE-NEXT: movdqa %xmm15, %xmm10
952 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
953 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
954 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7]
955 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
956 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
957 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
958 ; SSE-NEXT: pand %xmm0, %xmm1
959 ; SSE-NEXT: pandn %xmm2, %xmm0
960 ; SSE-NEXT: por %xmm1, %xmm0
961 ; SSE-NEXT: pand %xmm6, %xmm0
962 ; SSE-NEXT: movdqa %xmm14, %xmm4
963 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
964 ; SSE-NEXT: movdqa %xmm14, %xmm1
965 ; SSE-NEXT: psllq $48, %xmm1
966 ; SSE-NEXT: pandn %xmm1, %xmm6
967 ; SSE-NEXT: por %xmm0, %xmm6
968 ; SSE-NEXT: movdqa %xmm3, %xmm0
969 ; SSE-NEXT: movdqa %xmm3, %xmm14
970 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
971 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0]
972 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3]
973 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
974 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535]
975 ; SSE-NEXT: movaps %xmm3, %xmm1
976 ; SSE-NEXT: andnps %xmm0, %xmm1
977 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,1,3]
978 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7]
979 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3]
980 ; SSE-NEXT: pand %xmm3, %xmm15
981 ; SSE-NEXT: por %xmm1, %xmm15
982 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
983 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0]
984 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
985 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
986 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0]
987 ; SSE-NEXT: movdqa %xmm10, %xmm5
988 ; SSE-NEXT: movdqa %xmm10, %xmm1
989 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0]
990 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3]
991 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
992 ; SSE-NEXT: movaps %xmm3, %xmm2
993 ; SSE-NEXT: andnps %xmm1, %xmm2
994 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,1,3]
995 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
996 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3]
997 ; SSE-NEXT: pand %xmm3, %xmm0
998 ; SSE-NEXT: por %xmm2, %xmm0
999 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1000 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0]
1001 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
1002 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
1003 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
1004 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
1005 ; SSE-NEXT: movdqa %xmm3, %xmm2
1006 ; SSE-NEXT: pandn %xmm1, %xmm2
1007 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
1008 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
1009 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1010 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
1011 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1012 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
1013 ; SSE-NEXT: pand %xmm3, %xmm1
1014 ; SSE-NEXT: por %xmm2, %xmm1
1015 ; SSE-NEXT: movdqa %xmm14, %xmm4
1016 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[3,0]
1017 ; SSE-NEXT: movdqa %xmm3, %xmm2
1018 ; SSE-NEXT: pandn %xmm13, %xmm2
1019 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2]
1020 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,4,6,7]
1021 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1022 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,1,0,3]
1023 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,6]
1024 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[2,3]
1025 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0]
1026 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7]
1027 ; SSE-NEXT: movdqa %xmm3, %xmm14
1028 ; SSE-NEXT: pandn %xmm4, %xmm14
1029 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1]
1030 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3]
1031 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
1032 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[0,3,2,3,4,5,6,7]
1033 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1034 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm4[1,0,3,3,4,5,6,7]
1035 ; SSE-NEXT: pand %xmm3, %xmm13
1036 ; SSE-NEXT: por %xmm14, %xmm13
1037 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[3,0]
1038 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[0,2]
1039 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0]
1040 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2]
1041 ; SSE-NEXT: movdqa %xmm5, %xmm11
1042 ; SSE-NEXT: movdqa %xmm5, %xmm4
1043 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[3,0]
1044 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,6,6,7]
1045 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
1046 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7]
1047 ; SSE-NEXT: pand %xmm3, %xmm8
1048 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
1049 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1050 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7]
1051 ; SSE-NEXT: pand %xmm3, %xmm7
1052 ; SSE-NEXT: pandn %xmm12, %xmm3
1053 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[0,2]
1054 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,4,6,7]
1055 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1056 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,0,3]
1057 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6]
1058 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3]
1059 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0]
1060 ; SSE-NEXT: por %xmm2, %xmm8
1061 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1062 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
1063 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
1064 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1065 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
1066 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3]
1067 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0]
1068 ; SSE-NEXT: por %xmm7, %xmm3
1069 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3]
1070 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3]
1071 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1072 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
1073 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3]
1074 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
1075 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1076 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
1077 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1078 ; SSE-NEXT: movaps %xmm2, (%rsi)
1079 ; SSE-NEXT: movdqa %xmm6, 16(%rdx)
1080 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1081 ; SSE-NEXT: movaps %xmm2, (%rdx)
1082 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
1083 ; SSE-NEXT: movaps %xmm15, (%rcx)
1084 ; SSE-NEXT: movaps %xmm13, 16(%r8)
1085 ; SSE-NEXT: movaps %xmm1, (%r8)
1086 ; SSE-NEXT: movaps %xmm3, 16(%r9)
1087 ; SSE-NEXT: movaps %xmm8, (%r9)
; SSE-NEXT: retq
;
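; The AVX1 path below builds each output in 128-bit pieces (vpshufb/vpblendw on xmm halves), merges them under ymm and/andn masks, and joins the halves with vinsertf128.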
1090 ; AVX1-ONLY-LABEL: load_i16_stride5_vf16:
1091 ; AVX1-ONLY: # %bb.0:
1092 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0
1093 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
1094 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7]
1095 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1
1096 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1]
1097 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2
1098 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
1099 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
1100 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7]
1101 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8
1102 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm7
1103 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
1104 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1105 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7]
1106 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3
1107 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4
1108 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5
1109 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6
1110 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
1111 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7]
1112 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3]
1113 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7]
1114 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
1115 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
1116 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1117 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
1118 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
1119 ; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm12
1120 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9
1121 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1]
1122 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm11, %ymm13
1123 ; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm12
1124 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
1125 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1126 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7]
1127 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
1128 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3]
1129 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
1130 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
1131 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
1132 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1133 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
1134 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm9, %xmm13
1135 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm11, %ymm13
1136 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm14
1137 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3]
1138 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7]
1139 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1140 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1141 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7]
1142 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
1143 ; AVX1-ONLY-NEXT: vandps %ymm11, %ymm14, %ymm11
1144 ; AVX1-ONLY-NEXT: vorps %ymm13, %ymm11, %ymm11
1145 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
1146 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
1147 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u]
1148 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3]
1149 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
1150 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
1151 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
1152 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1153 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
1154 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3]
1155 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7]
1156 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
1157 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
1158 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
1159 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7]
1160 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0]
1161 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5]
1162 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
1163 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
1164 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1165 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u]
1166 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm14
1167 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7]
1168 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7]
1169 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1170 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7]
1171 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
1172 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
1173 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7]
1174 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
1175 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
1176 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
1177 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
1178 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
1179 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
1180 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
1181 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1182 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
1183 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1184 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
1185 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
1186 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1187 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
1188 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1189 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
1190 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7]
1191 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
1192 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
1193 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1194 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1195 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
1196 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
1197 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
1198 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3]
1199 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1200 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
1201 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1202 ; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi)
1203 ; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rdx)
1204 ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rcx)
1205 ; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8)
1206 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9)
1207 ; AVX1-ONLY-NEXT: vzeroupper
1208 ; AVX1-ONLY-NEXT: retq
1210 ; AVX2-SLOW-LABEL: load_i16_stride5_vf16:
1211 ; AVX2-SLOW: # %bb.0:
1212 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2
1213 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3
1214 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0
1215 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
1216 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1217 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
1218 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
1219 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
1220 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1221 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
1222 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
1223 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
1224 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
1225 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
1226 ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6
1227 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4
1228 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3]
1229 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1230 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1231 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
1232 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
1233 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
1234 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1235 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
1236 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
1237 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1238 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1239 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
1240 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
1241 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
1242 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
1243 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1244 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1245 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
1246 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
1247 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
1248 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1249 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
1250 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1251 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1252 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1253 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
1254 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
1255 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
1256 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
1257 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1258 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1259 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
1260 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1261 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
1262 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
1263 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1264 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1265 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1266 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1267 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
1268 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
1269 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1270 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
1271 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1272 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1273 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
1274 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1275 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
1276 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
1277 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1278 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1279 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1280 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1281 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
1282 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
1283 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1284 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
1285 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
1286 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
1287 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1288 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1289 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1290 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1291 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rsi)
1292 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rdx)
1293 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
1294 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8)
1295 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9)
1296 ; AVX2-SLOW-NEXT: vzeroupper
1297 ; AVX2-SLOW-NEXT: retq
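; The AVX2-FAST path below swaps the vpermq/vpblendd lane fixups used above for vpermd with precomputed index vectors (e.g. [1,3,0,2,4,6,1,3]) and reads 128(%rdi) as a single ymm.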
1299 ; AVX2-FAST-LABEL: load_i16_stride5_vf16:
1300 ; AVX2-FAST: # %bb.0:
1301 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0
1302 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3
1303 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
1304 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1
1305 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2
1306 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
1307 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
1308 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
1309 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
1310 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15]
1311 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3]
1312 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6
1313 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
1314 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
1315 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
1316 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0]
1317 ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
1318 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6
1319 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
1320 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1321 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
1322 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
1323 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
1324 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9
1325 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7]
1326 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
1327 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
1328 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6>
1329 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9
1330 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
1331 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6
1332 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0]
1333 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
1334 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm9
1335 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
1336 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9
1337 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
1338 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
1339 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
1340 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
1341 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
1342 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1343 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
1344 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4>
1345 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
1346 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
1347 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1348 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0]
1349 ; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1]
1350 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10
1351 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7
1352 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
1353 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1354 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
1355 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
1356 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1357 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1358 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
1359 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7>
1360 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
1361 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
1362 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1363 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0]
1364 ; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1]
1365 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10
1366 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8
1367 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
1368 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1369 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
1370 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
1371 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
1372 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1373 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
1374 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
1375 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
1376 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
1377 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
1378 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,5,7,0,2,5,7]
1379 ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
1380 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
1381 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
1382 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
1383 ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rsi)
1384 ; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rdx)
1385 ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rcx)
1386 ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r8)
1387 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9)
1388 ; AVX2-FAST-NEXT: vzeroupper
1389 ; AVX2-FAST-NEXT: retq
1391 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf16:
1392 ; AVX2-FAST-PERLANE: # %bb.0:
1393 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
1394 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3
1395 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0
1396 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1
1397 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1398 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
1399 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
1400 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
1401 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1402 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
1403 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
1404 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
1405 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
1406 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm6
1407 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4
1408 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5
1409 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3]
1410 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1411 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1412 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
1413 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
1414 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
1415 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
1416 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
1417 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
1418 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1419 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1420 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
1421 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
1422 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
1423 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3]
1424 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1425 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1426 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
1427 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
1428 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
1429 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
1430 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
1431 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1432 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1433 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1434 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
1435 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
1436 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
1437 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3]
1438 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1439 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1440 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
1441 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1442 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
1443 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10
1444 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1445 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1446 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1447 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1448 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
1449 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
1450 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1451 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3]
1452 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1453 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1454 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
1455 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1456 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
1457 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
1458 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1459 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1460 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1461 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1462 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
1463 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
1464 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1465 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u]
1466 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u]
1467 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1468 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1469 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1470 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rsi)
1471 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rdx)
1472 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx)
1473 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8)
1474 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9)
1475 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
1476 ; AVX2-FAST-PERLANE-NEXT: retq
1478 ; AVX512F-SLOW-LABEL: load_i16_stride5_vf16:
1479 ; AVX512F-SLOW: # %bb.0:
1480 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2
1481 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3
1482 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0
1483 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
1484 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1485 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
1486 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7]
1487 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
1488 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1489 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
1490 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
1491 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
1492 ; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm5
1493 ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6
1494 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4
1495 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
1496 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
1497 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1498 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
1499 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
1500 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1501 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
1502 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
1503 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
1504 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
1505 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1506 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
1507 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
1508 ; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7
1509 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
1510 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
1511 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1512 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
1513 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
1514 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
1515 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
1516 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
1517 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1518 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1519 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1520 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
1521 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
1522 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
1523 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
1524 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
1525 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1526 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
1527 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1528 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
1529 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10
1530 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1531 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1532 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1533 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1534 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
1535 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
1536 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1537 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
1538 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
1539 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1540 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
1541 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1542 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
1543 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
1544 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1545 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1546 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1547 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1548 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
1549 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
1550 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1551 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
1552 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
1553 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
1554 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1555 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1556 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1557 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1558 ; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rsi)
1559 ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx)
1560 ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
1561 ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8)
1562 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%r9)
1563 ; AVX512F-SLOW-NEXT: vzeroupper
1564 ; AVX512F-SLOW-NEXT: retq
1566 ; AVX512F-FAST-LABEL: load_i16_stride5_vf16:
1567 ; AVX512F-FAST: # %bb.0:
1568 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2
1569 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3
1570 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm0
1571 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm1
1572 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
1573 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,u,u,u,4,6,1,3>
1574 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4
1575 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
1576 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
1577 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
1578 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
1579 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
1580 ; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm5
1581 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0]
1582 ; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1]
1583 ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
1584 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6
1585 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
1586 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
1587 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
1588 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
1589 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1590 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,4,7,1,6>
1591 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
1592 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
1593 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
1594 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
1595 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
1596 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
1597 ; AVX512F-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6
1598 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0]
1599 ; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1]
1600 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm9
1601 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
1602 ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9
1603 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15]
1604 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
1605 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
1606 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
1607 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
1608 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
1609 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
1610 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4>
1611 ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
1612 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
1613 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1614 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0]
1615 ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1]
1616 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10
1617 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7
1618 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
1619 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1620 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
1621 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10
1622 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
1623 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
1624 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
1625 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7>
1626 ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
1627 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
1628 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
1629 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0]
1630 ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1]
1631 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10
1632 ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8
1633 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
1634 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1635 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
1636 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
1637 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
1638 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
1639 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
1640 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,u,u,6,0,3,5>
1641 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1642 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
1643 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1644 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7]
1645 ; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1]
1646 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1
1647 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
1648 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
1649 ; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
1650 ; AVX512F-FAST-NEXT: vmovdqa %ymm5, (%rsi)
1651 ; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rdx)
1652 ; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rcx)
1653 ; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%r8)
1654 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r9)
1655 ; AVX512F-FAST-NEXT: vzeroupper
1656 ; AVX512F-FAST-NEXT: retq
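; With AVX512BW the strided extraction below is done with vpermi2w word permutes: the two zmm loads supply the leading elements of each result, and a second vpermi2w merges the remaining ones from the ymm at 128(%rdi).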
1658 ; AVX512BW-LABEL: load_i16_stride5_vf16:
1659 ; AVX512BW: # %bb.0:
1660 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1661 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1662 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u>
1663 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1664 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27]
1665 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4
1666 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3
1667 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u>
1668 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1669 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28]
1670 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5
1671 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u>
1672 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
1673 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29]
1674 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6
1675 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u>
1676 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
1677 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30]
1678 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm7
1679 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u>
1680 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1681 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31]
1682 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0
1683 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rsi)
1684 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
1685 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx)
1686 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
1687 ; AVX512BW-NEXT: vmovdqa %ymm0, (%r9)
1688 ; AVX512BW-NEXT: vzeroupper
1689 ; AVX512BW-NEXT: retq
1690 %wide.vec = load <80 x i16>, ptr %in.vec, align 64
1691 %strided.vec0 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
1692 %strided.vec1 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76>
1693 %strided.vec2 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77>
1694 %strided.vec3 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78>
1695 %strided.vec4 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79>
1696 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
1697 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
1698 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
1699 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
1700 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
1701 ret void
1702 }
1704 define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
1705 ; SSE-LABEL: load_i16_stride5_vf32:
1706 ; SSE: # %bb.0:
1707 ; SSE-NEXT: subq $408, %rsp # imm = 0x198
1708 ; SSE-NEXT: movdqa 64(%rdi), %xmm4
1709 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1710 ; SSE-NEXT: movdqa (%rdi), %xmm10
1711 ; SSE-NEXT: movdqa 16(%rdi), %xmm13
1712 ; SSE-NEXT: movdqa 32(%rdi), %xmm9
1713 ; SSE-NEXT: movdqa 48(%rdi), %xmm5
1714 ; SSE-NEXT: movdqa 224(%rdi), %xmm7
1715 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1716 ; SSE-NEXT: movdqa 160(%rdi), %xmm11
1717 ; SSE-NEXT: movdqa 176(%rdi), %xmm12
1718 ; SSE-NEXT: movdqa 208(%rdi), %xmm3
1719 ; SSE-NEXT: movdqa 192(%rdi), %xmm2
1720 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1721 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
1722 ; SSE-NEXT: movdqa %xmm0, %xmm1
1723 ; SSE-NEXT: pandn %xmm2, %xmm1
1724 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
1725 ; SSE-NEXT: movdqa %xmm3, %xmm8
1726 ; SSE-NEXT: pand %xmm0, %xmm2
1727 ; SSE-NEXT: por %xmm1, %xmm2
1728 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
1729 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1730 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1731 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
1732 ; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
1733 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
1734 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1735 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
1736 ; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0]
1737 ; SSE-NEXT: andps %xmm15, %xmm3
1738 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
1739 ; SSE-NEXT: movaps %xmm15, %xmm2
1740 ; SSE-NEXT: pandn %xmm1, %xmm2
1741 ; SSE-NEXT: por %xmm3, %xmm2
1742 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1743 ; SSE-NEXT: movdqa %xmm0, %xmm1
1744 ; SSE-NEXT: pandn %xmm9, %xmm1
1745 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1746 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
1747 ; SSE-NEXT: movdqa %xmm5, %xmm7
1748 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1749 ; SSE-NEXT: pand %xmm0, %xmm2
1750 ; SSE-NEXT: por %xmm1, %xmm2
1751 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
1752 ; SSE-NEXT: movdqa %xmm13, %xmm5
1753 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1754 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1755 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
1756 ; SSE-NEXT: movdqa %xmm10, %xmm6
1757 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1758 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
1759 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1760 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
1761 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
1762 ; SSE-NEXT: movaps %xmm15, %xmm2
1763 ; SSE-NEXT: andnps %xmm1, %xmm2
1764 ; SSE-NEXT: movdqa 272(%rdi), %xmm4
1765 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1766 ; SSE-NEXT: andps %xmm15, %xmm3
1767 ; SSE-NEXT: orps %xmm3, %xmm2
1768 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1769 ; SSE-NEXT: movdqa %xmm0, %xmm1
1770 ; SSE-NEXT: pandn %xmm4, %xmm1
1771 ; SSE-NEXT: movdqa 288(%rdi), %xmm2
1772 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1773 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
1774 ; SSE-NEXT: pand %xmm0, %xmm2
1775 ; SSE-NEXT: por %xmm1, %xmm2
1776 ; SSE-NEXT: movdqa 256(%rdi), %xmm3
1777 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
1778 ; SSE-NEXT: movdqa %xmm3, %xmm14
1779 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1780 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1781 ; SSE-NEXT: movdqa 240(%rdi), %xmm13
1782 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
1783 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1784 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
1785 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1786 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
1787 ; SSE-NEXT: movdqa 304(%rdi), %xmm1
1788 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1789 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
1790 ; SSE-NEXT: movaps %xmm15, %xmm2
1791 ; SSE-NEXT: andnps %xmm1, %xmm2
1792 ; SSE-NEXT: andps %xmm15, %xmm3
1793 ; SSE-NEXT: orps %xmm3, %xmm2
1794 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1795 ; SSE-NEXT: movdqa 128(%rdi), %xmm1
1796 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1797 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1798 ; SSE-NEXT: pand %xmm0, %xmm1
1799 ; SSE-NEXT: movdqa 112(%rdi), %xmm2
1800 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1801 ; SSE-NEXT: pandn %xmm2, %xmm0
1802 ; SSE-NEXT: por %xmm1, %xmm0
1803 ; SSE-NEXT: movdqa 96(%rdi), %xmm1
1804 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1805 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1806 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1807 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
1808 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
1809 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1810 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
1811 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1812 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
1813 ; SSE-NEXT: movdqa 144(%rdi), %xmm0
1814 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1815 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1816 ; SSE-NEXT: movaps %xmm15, %xmm1
1817 ; SSE-NEXT: andnps %xmm0, %xmm1
1818 ; SSE-NEXT: andps %xmm15, %xmm2
1819 ; SSE-NEXT: orps %xmm2, %xmm1
1820 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1821 ; SSE-NEXT: psrlq $48, %xmm12
1822 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3]
1823 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
1824 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
1825 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
1826 ; SSE-NEXT: movdqa %xmm0, %xmm2
1827 ; SSE-NEXT: pandn %xmm1, %xmm2
1828 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1829 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3]
1830 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1831 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
1832 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1833 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
1834 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1835 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1836 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
1837 ; SSE-NEXT: pand %xmm0, %xmm1
1838 ; SSE-NEXT: por %xmm2, %xmm1
1839 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1840 ; SSE-NEXT: movdqa %xmm10, %xmm2
1841 ; SSE-NEXT: psllq $48, %xmm2
1842 ; SSE-NEXT: movaps %xmm15, %xmm3
1843 ; SSE-NEXT: andnps %xmm2, %xmm3
1844 ; SSE-NEXT: pand %xmm15, %xmm1
1845 ; SSE-NEXT: orps %xmm1, %xmm3
1846 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1847 ; SSE-NEXT: psrlq $48, %xmm5
1848 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3]
1849 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
1850 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1851 ; SSE-NEXT: movdqa %xmm0, %xmm1
1852 ; SSE-NEXT: pandn %xmm2, %xmm1
1853 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3]
1854 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
1855 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1856 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
1857 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1858 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1859 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
1860 ; SSE-NEXT: pand %xmm0, %xmm2
1861 ; SSE-NEXT: por %xmm1, %xmm2
1862 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1863 ; SSE-NEXT: movdqa %xmm6, %xmm1
1864 ; SSE-NEXT: psllq $48, %xmm1
1865 ; SSE-NEXT: movdqa %xmm15, %xmm3
1866 ; SSE-NEXT: pandn %xmm1, %xmm3
1867 ; SSE-NEXT: pand %xmm15, %xmm2
1868 ; SSE-NEXT: por %xmm2, %xmm3
1869 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1870 ; SSE-NEXT: movdqa %xmm14, %xmm1
1871 ; SSE-NEXT: psrlq $48, %xmm1
1872 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3]
1873 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
1874 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1875 ; SSE-NEXT: movdqa %xmm0, %xmm1
1876 ; SSE-NEXT: pandn %xmm2, %xmm1
1877 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1878 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3]
1879 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1880 ; SSE-NEXT: # xmm3 = mem[0,2,2,3]
1881 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1882 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
1883 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1884 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1885 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
1886 ; SSE-NEXT: pand %xmm0, %xmm2
1887 ; SSE-NEXT: por %xmm1, %xmm2
1888 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1889 ; SSE-NEXT: movdqa %xmm7, %xmm1
1890 ; SSE-NEXT: psllq $48, %xmm1
1891 ; SSE-NEXT: movdqa %xmm15, %xmm3
1892 ; SSE-NEXT: pandn %xmm1, %xmm3
1893 ; SSE-NEXT: pand %xmm15, %xmm2
1894 ; SSE-NEXT: por %xmm2, %xmm3
1895 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1896 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1897 ; SSE-NEXT: movdqa %xmm13, %xmm1
1898 ; SSE-NEXT: psrlq $48, %xmm1
1899 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
1900 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
1901 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1902 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1903 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,3,2,3]
1904 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1905 ; SSE-NEXT: # xmm3 = mem[0,2,2,3]
1906 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
1907 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7]
1908 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1909 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1910 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
1911 ; SSE-NEXT: pand %xmm0, %xmm1
1912 ; SSE-NEXT: pandn %xmm2, %xmm0
1913 ; SSE-NEXT: por %xmm1, %xmm0
1914 ; SSE-NEXT: pand %xmm15, %xmm0
1915 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1916 ; SSE-NEXT: movdqa %xmm5, %xmm1
1917 ; SSE-NEXT: psllq $48, %xmm1
1918 ; SSE-NEXT: pandn %xmm1, %xmm15
1919 ; SSE-NEXT: por %xmm0, %xmm15
1920 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1921 ; SSE-NEXT: movdqa %xmm8, %xmm0
1922 ; SSE-NEXT: movdqa %xmm11, %xmm8
1923 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
1924 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3]
1925 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
1926 ; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535]
1927 ; SSE-NEXT: movaps %xmm11, %xmm1
1928 ; SSE-NEXT: andnps %xmm0, %xmm1
1929 ; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
1930 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3]
1931 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
1932 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1933 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1934 ; SSE-NEXT: pand %xmm11, %xmm2
1935 ; SSE-NEXT: por %xmm1, %xmm2
1936 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1937 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0]
1938 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1939 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
1940 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
1941 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1942 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1943 ; SSE-NEXT: movaps %xmm1, %xmm0
1944 ; SSE-NEXT: movaps %xmm1, %xmm15
1945 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1946 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
1947 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3]
1948 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
1949 ; SSE-NEXT: movaps %xmm11, %xmm1
1950 ; SSE-NEXT: andnps %xmm0, %xmm1
1951 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1952 ; SSE-NEXT: # xmm2 = mem[0,1,1,3]
1953 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
1954 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1955 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
1956 ; SSE-NEXT: pand %xmm11, %xmm2
1957 ; SSE-NEXT: por %xmm1, %xmm2
1958 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1959 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0]
1960 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1961 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
1962 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
1963 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1964 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1965 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0]
1966 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3]
1967 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0,1,3]
1968 ; SSE-NEXT: movaps %xmm11, %xmm1
1969 ; SSE-NEXT: andnps %xmm14, %xmm1
1970 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1971 ; SSE-NEXT: # xmm2 = mem[0,1,1,3]
1972 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
1973 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1974 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
1975 ; SSE-NEXT: pand %xmm11, %xmm2
1976 ; SSE-NEXT: por %xmm1, %xmm2
1977 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7]
1978 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0]
1979 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1980 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
1981 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
1982 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1983 ; SSE-NEXT: movdqa %xmm12, %xmm0
1984 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1985 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1986 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1987 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3]
1988 ; SSE-NEXT: movaps %xmm11, %xmm1
1989 ; SSE-NEXT: andnps %xmm0, %xmm1
1990 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1991 ; SSE-NEXT: # xmm2 = mem[0,1,1,3]
1992 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
1993 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
1994 ; SSE-NEXT: pand %xmm11, %xmm2
1995 ; SSE-NEXT: por %xmm1, %xmm2
1996 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1997 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0]
1998 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1999 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2000 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
2001 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2002 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7]
2003 ; SSE-NEXT: movdqa %xmm11, %xmm1
2004 ; SSE-NEXT: pandn %xmm0, %xmm1
2005 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2006 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
2007 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2008 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2009 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2010 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7]
2011 ; SSE-NEXT: pand %xmm11, %xmm0
2012 ; SSE-NEXT: por %xmm1, %xmm0
2013 ; SSE-NEXT: movdqa %xmm0, %xmm2
2014 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2015 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[3,0]
2016 ; SSE-NEXT: movaps %xmm11, %xmm0
2017 ; SSE-NEXT: andnps %xmm8, %xmm0
2018 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2019 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,2]
2020 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7]
2021 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2022 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
2023 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2024 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2025 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
2026 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2027 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7]
2028 ; SSE-NEXT: movdqa %xmm11, %xmm1
2029 ; SSE-NEXT: pandn %xmm0, %xmm1
2030 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2031 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2032 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2033 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3]
2034 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2035 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2036 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2037 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,0,3,3,4,5,6,7]
2038 ; SSE-NEXT: pand %xmm11, %xmm13
2039 ; SSE-NEXT: por %xmm1, %xmm13
2040 ; SSE-NEXT: movaps %xmm15, %xmm0
2041 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0]
2042 ; SSE-NEXT: movdqa %xmm11, %xmm12
2043 ; SSE-NEXT: pandn %xmm9, %xmm12
2044 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2]
2045 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7]
2046 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2047 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
2048 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2049 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2050 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0]
2051 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7]
2052 ; SSE-NEXT: movdqa %xmm11, %xmm1
2053 ; SSE-NEXT: pandn %xmm0, %xmm1
2054 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2055 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
2056 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2057 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
2058 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2059 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2060 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2061 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7]
2062 ; SSE-NEXT: pand %xmm11, %xmm9
2063 ; SSE-NEXT: por %xmm1, %xmm9
2064 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2065 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0]
2066 ; SSE-NEXT: movdqa %xmm11, %xmm15
2067 ; SSE-NEXT: pandn %xmm10, %xmm15
2068 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2]
2069 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7]
2070 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3]
2071 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2072 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2073 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0]
2074 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2075 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7]
2076 ; SSE-NEXT: movdqa %xmm11, %xmm1
2077 ; SSE-NEXT: pandn %xmm0, %xmm1
2078 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2079 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
2080 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2081 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
2082 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2083 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2084 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2085 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,0,3,3,4,5,6,7]
2086 ; SSE-NEXT: pand %xmm11, %xmm10
2087 ; SSE-NEXT: por %xmm1, %xmm10
2088 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2089 ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
2090 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
2091 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2092 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm14[3,0]
2093 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,2]
2094 ; SSE-NEXT: movaps %xmm14, %xmm2
2095 ; SSE-NEXT: movdqa %xmm3, %xmm1
2096 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[3,0]
2097 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2]
2098 ; SSE-NEXT: movaps %xmm1, %xmm14
2099 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0]
2100 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
2101 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2102 ; SSE-NEXT: movaps %xmm8, %xmm1
2103 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0]
2104 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2105 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2106 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7]
2107 ; SSE-NEXT: pand %xmm11, %xmm3
2108 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7]
2109 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2110 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7]
2111 ; SSE-NEXT: pand %xmm11, %xmm2
2112 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7]
2113 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2114 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
2115 ; SSE-NEXT: pand %xmm11, %xmm0
2116 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
2117 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2118 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7]
2119 ; SSE-NEXT: pand %xmm11, %xmm4
2120 ; SSE-NEXT: pandn %xmm7, %xmm11
2121 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2]
2122 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7]
2123 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2124 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,3]
2125 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6]
2126 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[2,3]
2127 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,0]
2128 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2129 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2130 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
2131 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2132 ; SSE-NEXT: # xmm14 = mem[0,1,1,3]
2133 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2134 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
2135 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[1,3]
2136 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0]
2137 ; SSE-NEXT: por %xmm12, %xmm2
2138 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2139 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
2140 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2141 ; SSE-NEXT: # xmm12 = mem[0,1,1,3]
2142 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2143 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
2144 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3]
2145 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,0]
2146 ; SSE-NEXT: por %xmm15, %xmm0
2147 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2148 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
2149 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2150 ; SSE-NEXT: # xmm5 = mem[0,1,1,3]
2151 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2152 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2153 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3]
2154 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0]
2155 ; SSE-NEXT: por %xmm4, %xmm11
2156 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3]
2157 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,1,3]
2158 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2159 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
2160 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3]
2161 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0]
2162 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2163 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
2164 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2165 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
2166 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2167 ; SSE-NEXT: movaps %xmm1, (%rsi)
2168 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2169 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
2170 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2171 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
2172 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2173 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
2174 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2175 ; SSE-NEXT: movaps %xmm1, (%rdx)
2176 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2177 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
2178 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2179 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
2180 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2181 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
2182 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2183 ; SSE-NEXT: movaps %xmm1, (%rcx)
2184 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2185 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
2186 ; SSE-NEXT: movaps %xmm10, 16(%r8)
2187 ; SSE-NEXT: movaps %xmm9, 48(%r8)
2188 ; SSE-NEXT: movaps %xmm13, (%r8)
2189 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2190 ; SSE-NEXT: movaps %xmm1, 32(%r8)
2191 ; SSE-NEXT: movaps %xmm11, 16(%r9)
2192 ; SSE-NEXT: movaps %xmm0, 48(%r9)
2193 ; SSE-NEXT: movaps %xmm2, (%r9)
2194 ; SSE-NEXT: movaps %xmm3, 32(%r9)
2195 ; SSE-NEXT: addq $408, %rsp # imm = 0x198
2196 ; SSE-NEXT: retq
2197 ;
2198 ; AVX1-ONLY-LABEL: load_i16_stride5_vf32:
2199 ; AVX1-ONLY: # %bb.0:
2200 ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8
2201 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2
2202 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
2203 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
2204 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9
2205 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2206 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7
2207 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
2208 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2209 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
2210 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
2211 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11
2212 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2213 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3
2214 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
2215 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10
2216 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2217 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3
2218 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2219 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2220 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
2221 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
2222 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
2223 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5
2224 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2225 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6
2226 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3
2227 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2228 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4
2229 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,0,3]
2230 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15
2231 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
2232 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3]
2233 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12
2234 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2235 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
2236 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2237 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
2238 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2239 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
2240 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
2241 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3
2242 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0
2243 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
2244 ; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm5
2245 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2246 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
2247 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
2248 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
2249 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2250 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2
2251 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2252 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13
2253 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3],xmm13[4,5,6,7]
2254 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2255 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
2256 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0
2257 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2258 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2259 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2260 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0
2261 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
2262 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2263 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0
2264 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2265 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
2266 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
2267 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
2268 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4],xmm1[5,6,7]
2269 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0
2270 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2271 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
2272 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2273 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0
2274 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2275 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
2276 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
2277 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2278 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0
2279 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2280 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2281 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2
2282 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7]
2283 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14
2284 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2285 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2286 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0
2287 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1
2288 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2289 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1]
2290 ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm6, %ymm8
2291 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0
2292 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
2293 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2294 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm6
2295 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2296 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
2297 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
2298 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2299 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2300 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
2301 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
2302 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
2303 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2304 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7]
2305 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
2306 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3
2307 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
2308 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2309 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3]
2310 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
2311 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm9
2312 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
2313 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm12
2314 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2315 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2316 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7]
2317 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
2318 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9
2319 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7]
2320 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
2321 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3
2322 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm9
2323 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9
2324 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm9, %ymm3
2325 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
2326 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2327 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2328 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7]
2329 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0
2330 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
2331 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload
2332 ; AVX1-ONLY-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
2333 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7]
2334 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
2335 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2336 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3]
2337 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
2338 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
2339 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
2340 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2341 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7]
2342 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1
2343 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2344 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3]
2345 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
2346 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2347 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8
2348 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
2349 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7]
2350 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
2351 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1
2352 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2353 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm14, %xmm3
2354 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3
2355 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1
2356 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2357 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2358 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
2359 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
2360 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13>
2361 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2362 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
2363 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
2364 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
2365 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
2366 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7]
2367 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
2368 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
2369 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
2370 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
2371 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
2372 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3
2373 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3]
2374 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
2375 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2376 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
2377 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7]
2378 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2379 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0]
2380 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5]
2381 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
2382 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
2383 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2384 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2385 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9
2386 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
2387 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4
2388 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2389 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm0
2390 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7]
2391 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1
2392 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2
2393 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3]
2394 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
2395 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
2396 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7]
2397 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2398 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7]
2399 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
2400 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2401 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3]
2402 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
2403 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2404 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
2405 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7]
2406 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0]
2407 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
2408 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
2409 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
2410 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2411 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7]
2412 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15>
2413 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0
2414 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3
2415 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
2416 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7]
2417 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
2418 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3
2419 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
2420 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7]
2421 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
2422 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
2423 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7]
2424 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
2425 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
2426 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7]
2427 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3]
2428 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
2429 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
2430 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
2431 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2432 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2433 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2434 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7]
2435 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0
2436 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2437 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2438 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
2439 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1
2440 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2441 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3
2442 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
2443 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7]
2444 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0
2445 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2446 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
2447 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1
2448 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2449 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2450 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7]
2451 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
2452 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7]
2453 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7]
2454 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3]
2455 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
2456 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
2457 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7
2458 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]
2459 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
2460 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
2461 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
2462 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2463 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3]
2464 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
2465 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
2466 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
2467 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
2468 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7]
2469 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
2470 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3]
2471 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7]
2472 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
2473 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
2474 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3
2475 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7]
2476 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
2477 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2478 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
2479 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
2480 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2481 ; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3]
2482 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
2483 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2484 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3]
2485 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
2486 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2487 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2488 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3]
2489 ; AVX1-ONLY-NEXT: vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
2490 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
2491 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2492 ; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3]
2493 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7]
2494 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
2495 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2496 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
2497 ; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
2498 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5
2499 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
2500 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
2501 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
2502 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
2503 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7]
2504 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3]
2505 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
2506 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
2507 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2508 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2509 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi)
2510 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2511 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi)
2512 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2513 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx)
2514 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2515 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx)
2516 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2517 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx)
2518 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2519 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx)
2520 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2521 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
2522 ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8)
2523 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9)
2524 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9)
2525 ; AVX1-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8
2526 ; AVX1-ONLY-NEXT: vzeroupper
2527 ; AVX1-ONLY-NEXT: retq
2528 ;
2529 ; AVX2-SLOW-LABEL: load_i16_stride5_vf32:
2530 ; AVX2-SLOW: # %bb.0:
2531 ; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108
2532 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1
2533 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
2534 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
2535 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12
2536 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4
2537 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5
2538 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7
2539 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6
2540 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
2541 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
2542 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7]
2543 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
2544 ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0
2545 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
2546 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2547 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9
2548 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
2549 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
2550 ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8
2551 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
2552 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8
2553 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4],ymm3[5],ymm12[6,7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12],ymm3[13],ymm12[14,15]
2554 ; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm15
2555 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
2556 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7]
2557 ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0
2558 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2559 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
2560 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12
2561 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7]
2562 ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10
2563 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm12
2564 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
2565 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
2566 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7]
2567 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
2568 ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0
2569 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
2570 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13
2571 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
2572 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
2573 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11
2574 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm0, %ymm0
2575 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
2576 ; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm5
2577 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
2578 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
2579 ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm10
2580 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
2581 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15
2582 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13
2583 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
2584 ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm13
2585 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11
2586 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm14
2587 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm9
2588 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3]
2589 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
2590 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10
2591 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2592 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
2593 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7]
2594 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2595 ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11
2596 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm10
2597 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3]
2598 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1
2599 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2600 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15]
2601 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
2602 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2603 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3]
2604 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
2605 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
2606 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2607 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
2608 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2609 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2610 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3]
2611 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0
2612 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2613 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15]
2614 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
2615 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2616 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
2617 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9
2618 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2619 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2620 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2621 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
2622 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2623 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
2624 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8
2625 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
2626 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
2627 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0
2628 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
2629 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2630 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3]
2631 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
2632 ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
2633 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2634 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
2635 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2636 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2637 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15]
2638 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6
2639 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2640 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2641 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
2642 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0
2643 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
2644 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15]
2645 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8
2646 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
2647 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
2648 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2649 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3]
2650 ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
2651 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2652 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
2653 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2654 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2655 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
2656 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2657 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
2658 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
2659 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7
2660 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8
2661 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3]
2662 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
2663 ; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0
2664 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
2665 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
2666 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2667 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3]
2668 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
2669 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1
2670 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2671 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
2672 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2673 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2674 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
2675 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
2676 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
2677 ; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0
2678 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15]
2679 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm1
2680 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12
2681 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
2682 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8
2683 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7]
2684 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3]
2685 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2
2686 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2687 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
2688 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2689 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2690 ; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
2691 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
2692 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
2693 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
2694 ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
2695 ; AVX2-SLOW-NEXT: # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
2696 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
2697 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
2698 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
2699 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1]
2700 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
2701 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
2702 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4
2703 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
2704 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3]
2705 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
2706 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
2707 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
2708 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
2709 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2710 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
2711 ; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
2712 ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
2713 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
2714 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
2715 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3
2716 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
2717 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
2718 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
2719 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4
2720 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
2721 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
2722 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
2723 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
2724 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
2725 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2726 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2727 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
2728 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2729 ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi)
2730 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2731 ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi)
2732 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2733 ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx)
2734 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2735 ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx)
2736 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2737 ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx)
2738 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2739 ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx)
2740 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2741 ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8)
2742 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
2743 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9)
2744 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9)
2745 ; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108
2746 ; AVX2-SLOW-NEXT: vzeroupper
2747 ; AVX2-SLOW-NEXT: retq
2749 ; AVX2-FAST-LABEL: load_i16_stride5_vf32:
2750 ; AVX2-FAST: # %bb.0:
2751 ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128
2752 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15
2753 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1
2754 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3
2755 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14
2756 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2
2757 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
2758 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4
2759 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5
2760 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
2761 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7
2762 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2763 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6
2764 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2765 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3]
2766 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8
2767 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
2768 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8
2769 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
2770 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5
2771 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4
2772 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12
2773 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7]
2774 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
2775 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm13
2776 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
2777 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8
2778 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15]
2779 ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm0
2780 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2781 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14
2782 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7]
2783 ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12
2784 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15]
2785 ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
2786 ; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm2
2787 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm10
2788 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10
2789 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11
2790 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
2791 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6>
2792 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10
2793 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
2794 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10
2795 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
2796 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7
2797 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15
2798 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7]
2799 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
2800 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13
2801 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13
2802 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
2803 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0
2804 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7]
2805 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0
2806 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
2807 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15
2808 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10
2809 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm12
2810 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10
2811 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
2812 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0
2813 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm9
2814 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
2815 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9
2816 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
2817 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
2818 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2819 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm10
2820 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm8
2821 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8
2822 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
2823 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7]
2824 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2825 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7]
2826 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11
2827 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
2828 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11
2829 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15]
2830 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7]
2831 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2832 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8
2833 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8
2834 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15]
2835 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
2836 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2837 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
2838 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9
2839 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2840 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2841 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8
2842 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7]
2843 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
2844 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2845 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
2846 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4>
2847 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8
2848 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23>
2849 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8
2850 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
2851 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2852 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
2853 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
2854 ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1]
2855 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11
2856 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11
2857 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15]
2858 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
2859 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2860 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6
2861 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2862 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15]
2863 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13
2864 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7]
2865 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1
2866 ; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
2867 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
2868 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm11
2869 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2
2870 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
2871 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm2
2872 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
2873 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
2874 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2875 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2876 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
2877 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
2878 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
2879 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
2880 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7
2881 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7>
2882 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2
2883 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21>
2884 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2
2885 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
2886 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1
2887 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
2888 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
2889 ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
2890 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm8
2891 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
2892 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8
2893 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
2894 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
2895 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2896 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15]
2897 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8
2898 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0
2899 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
2900 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0
2901 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
2902 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9
2903 ; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm14
2904 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1
2905 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1
2906 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
2907 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm1
2908 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1
2909 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
2910 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2911 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15]
2912 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2913 ; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2914 ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
2915 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
2916 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
2917 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
2918 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
2919 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
2920 ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
2921 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
2922 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
2923 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
2924 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
2925 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
2926 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm6
2927 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
2928 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
2929 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
2930 ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
2931 ; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
2932 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
2933 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
2934 ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3
2935 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15]
2936 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2
2937 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
2938 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
2939 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1
2940 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1
2941 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2942 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2943 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi)
2944 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2945 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi)
2946 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2947 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx)
2948 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2949 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx)
2950 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2951 ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx)
2952 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2953 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx)
2954 ; AVX2-FAST-NEXT: vmovdqa %ymm15, 32(%r8)
2955 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2956 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8)
2957 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9)
2958 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9)
2959 ; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128
2960 ; AVX2-FAST-NEXT: vzeroupper
2961 ; AVX2-FAST-NEXT: retq
2963 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf32:
2964 ; AVX2-FAST-PERLANE: # %bb.0:
2965 ; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108
2966 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13
2967 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6
2968 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7
2969 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10
2970 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2
2971 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3
2972 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1
2973 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4
2974 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15]
2975 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5
2976 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2977 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4
2978 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2979 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
2980 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
2981 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0
2982 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
2983 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2984 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
2985 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
2986 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
2987 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8
2988 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
2989 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0
2990 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15]
2991 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1]
2992 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7]
2993 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm8, %ymm1
2994 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2995 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15]
2996 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12
2997 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7]
2998 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8
2999 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm1, %ymm12
3000 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
3001 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
3002 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
3003 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
3004 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1
3005 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
3006 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5
3007 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm14
3008 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7]
3009 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
3010 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm9
3011 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm1, %ymm1
3012 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15]
3013 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3014 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1]
3015 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
3016 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15
3017 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15]
3018 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9
3019 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
3020 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm8
3021 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm14
3022 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm9
3023 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11
3024 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3]
3025 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
3026 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14
3027 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3028 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15]
3029 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
3030 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3031 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6
3032 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm15
3033 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
3034 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2
3035 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3036 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15]
3037 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7]
3038 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3039 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3]
3040 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
3041 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2
3042 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3043 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3044 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3045 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3046 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3]
3047 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1
3048 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3049 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15]
3050 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7]
3051 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3052 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3053 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3054 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
3055 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3056 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
3057 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
3058 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3059 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15]
3060 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11
3061 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
3062 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
3063 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1
3064 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
3065 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2
3066 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
3067 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3]
3068 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
3069 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2
3070 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3071 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3072 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3073 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3074 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15]
3075 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10
3076 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3077 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
3078 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1
3079 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3080 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15]
3081 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11
3082 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
3083 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2
3084 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
3085 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3]
3086 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2
3087 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3088 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3089 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3090 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3091 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
3092 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3093 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
3094 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15]
3095 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11
3096 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3]
3097 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
3098 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1
3099 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
3100 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2
3101 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
3102 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3]
3103 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
3104 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2
3105 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3106 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
3107 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3108 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5
3109 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3110 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
3111 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1]
3112 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7]
3113 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1
3114 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4
3115 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15]
3116 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm14
3117 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
3118 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11
3119 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7]
3120 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3]
3121 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3
3122 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3123 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
3124 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3125 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
3126 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm3
3127 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
3128 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm12
3129 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
3130 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
3131 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
3132 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7]
3133 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15]
3134 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
3135 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
3136 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
3137 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1]
3138 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6
3139 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
3140 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4
3141 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
3142 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3143 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
3144 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3145 ; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3146 ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
3147 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3148 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7]
3149 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4
3150 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
3151 ; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
3152 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
3153 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6
3154 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
3155 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5
3156 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
3157 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm5
3158 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0
3159 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
3160 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3161 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
3162 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3163 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi)
3164 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3165 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi)
3166 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3167 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx)
3168 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3169 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx)
3170 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3171 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx)
3172 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3173 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx)
3174 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8)
3175 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8)
3176 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r9)
3177 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9)
3178 ; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108
3179 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
3180 ; AVX2-FAST-PERLANE-NEXT: retq
3182 ; AVX512F-SLOW-LABEL: load_i16_stride5_vf32:
3183 ; AVX512F-SLOW: # %bb.0:
3184 ; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0
3185 ; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1
3186 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
3187 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
3188 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
3189 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
3190 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3191 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7
3192 ; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm8
3193 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
3194 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
3195 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6,7]
3196 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
3197 ; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm9
3198 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3]
3199 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
3200 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm10
3201 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
3202 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
3203 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3204 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
3205 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm2[5,6,7]
3206 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5
3207 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6
3208 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2
3209 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3
3210 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
3211 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3212 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7]
3213 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
3214 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
3215 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
3216 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
3217 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
3218 ; AVX512F-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm13
3219 ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm12
3220 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm11
3221 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0],xmm12[1],xmm11[2,3]
3222 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
3223 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm15
3224 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
3225 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm15
3226 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm16
3227 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15]
3228 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15
3229 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5,6,7]
3230 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
3231 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm9, %xmm15
3232 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,3,2,3]
3233 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
3234 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1]
3235 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5,6,7]
3236 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7]
3237 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
3238 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15
3239 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
3240 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
3241 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3242 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5,6,7]
3243 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
3244 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
3245 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
3246 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
3247 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
3248 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14
3249 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6],xmm14[7]
3250 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u]
3251 ; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm14, %ymm13
3252 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3]
3253 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
3254 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3255 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14
3256 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm18
3257 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
3258 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14
3259 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2,3,4,5,6,7]
3260 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
3261 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,1,3]
3262 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7]
3263 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm9[2],xmm14[3],xmm9[3]
3264 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm4[3,4,5,6,7]
3265 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
3266 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3267 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15
3268 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
3269 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
3270 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3271 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7]
3272 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
3273 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15
3274 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
3275 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
3276 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
3277 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1]
3278 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7]
3279 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
3280 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7]
3281 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0],xmm11[1],xmm12[2,3]
3282 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
3283 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3284 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14
3285 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm17
3286 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
3287 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13
3288 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2],ymm4[3,4,5,6,7]
3289 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
3290 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm9[0],xmm10[1],xmm9[2,3]
3291 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
3292 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm4[3,4,5,6,7]
3293 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
3294 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3]
3295 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
3296 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3297 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4
3298 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
3299 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15
3300 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
3301 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
3302 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
3303 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,3,0,1]
3304 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7]
3305 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
3306 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
3307 ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13
3308 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm4
3309 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
3310 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15
3311 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
3312 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
3313 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3314 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm14[1,2,3,4,5,6,7],ymm4[8],ymm14[9,10,11,12,13,14,15]
3315 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
3316 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4
3317 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
3318 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8
3319 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3,4,5,6,7]
3320 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
3321 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm10[2],xmm9[3]
3322 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
3323 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
3324 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3325 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[3,1,2,3]
3326 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
3327 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
3328 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7]
3329 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
3330 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
3331 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
3332 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
3333 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
3334 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
3335 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
3336 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
3337 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
3338 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
3339 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
3340 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
3341 ; AVX512F-SLOW-NEXT: movb $7, %al
3342 ; AVX512F-SLOW-NEXT: kmovw %eax, %k1
3343 ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm7 {%k1}
3344 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm2
3345 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
3346 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
3347 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
3348 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
3349 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3350 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
3351 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3352 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
3353 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi)
3354 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rdx)
3355 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx)
3356 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%r8)
3357 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9)
3358 ; AVX512F-SLOW-NEXT: vzeroupper
3359 ; AVX512F-SLOW-NEXT: retq
3360 ;
3361 ; AVX512F-FAST-LABEL: load_i16_stride5_vf32:
3362 ; AVX512F-FAST: # %bb.0:
3363 ; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm2
3364 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
3365 ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm3
3366 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
3367 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3368 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4
3369 ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5
3370 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
3371 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,4,7,1,4,6,u,u>
3372 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1
3373 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
3374 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,3,2,4,u,u,u>
3375 ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
3376 ; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm0
3377 ; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm1
3378 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
3379 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8
3380 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
3381 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
3382 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3383 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
3384 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm10
3385 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm11
3386 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm8
3387 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm9
3388 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
3389 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,u,u,u,4,6,1,3>
3390 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm7
3391 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
3392 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
3393 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
3394 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
3395 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
3396 ; AVX512F-FAST-NEXT: vpor %ymm7, %ymm12, %ymm12
3397 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,1,u,0,3,5,u>
3398 ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm7
3399 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm14
3400 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
3401 ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14
3402 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
3403 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm15, %zmm14
3404 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm16
3405 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
3406 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,u,u,u,4,7,1,6>
3407 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12
3408 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
3409 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
3410 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm6
3411 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6],xmm6[7]
3412 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[u,u,u,u,u,u]
3413 ; AVX512F-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6
3414 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,3,2,u,1,3,6,u>
3415 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm12
3416 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
3417 ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12
3418 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm19
3419 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm15, %zmm12
3420 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm2, %xmm6
3421 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
3422 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
3423 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
3424 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <0,2,5,7,4,7,u,u>
3425 ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm17, %ymm14
3426 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
3427 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm14[3,4,5,6,7]
3428 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
3429 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
3430 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3]
3431 ; AVX512F-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3]
3432 ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm17, %ymm14
3433 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
3434 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7]
3435 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm17
3436 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3437 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14
3438 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7]
3439 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
3440 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3441 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
3442 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3]
3443 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
3444 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,3,5,2,5,7,u,u>
3445 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm18, %ymm12
3446 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
3447 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm12[3,4,5,6,7]
3448 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
3449 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7]
3450 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
3451 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
3452 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4],xmm12[5,6,7]
3453 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
3454 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
3455 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,2,u,u,5,7,2,4>
3456 ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm18, %ymm14
3457 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
3458 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
3459 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,4,6,0,1,4,6,0]
3460 ; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1]
3461 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14
3462 ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm13
3463 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm15, %zmm13
3464 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm13
3465 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
3466 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm12
3467 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
3468 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
3469 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
3470 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,u,u,5,0,2,7>
3471 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12
3472 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
3473 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
3474 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
3475 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
3476 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
3477 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,3,6,0,5,u,u,u>
3478 ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14
3479 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
3480 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5,6,7]
3481 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
3482 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,4,7,0,2,4,7,0]
3483 ; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1]
3484 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14
3485 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm15
3486 ; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14
3487 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12
3488 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm12
3489 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm12, %ymm6
3490 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
3491 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15
3492 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
3493 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
3494 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3495 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0],ymm14[1,2,3,4,5,6,7],ymm6[8],ymm14[9,10,11,12,13,14,15]
3496 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
3497 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6
3498 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
3499 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11
3500 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
3501 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
3502 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
3503 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,u,u,6,0,3,5>
3504 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8
3505 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
3506 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
3507 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
3508 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
3509 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
3510 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u>
3511 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
3512 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
3513 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
3514 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
3515 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7]
3516 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm3
3517 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
3518 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
3519 ; AVX512F-FAST-NEXT: movb $7, %al
3520 ; AVX512F-FAST-NEXT: kmovw %eax, %k1
3521 ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
3522 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm3
3523 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
3524 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
3525 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
3526 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
3527 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3528 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
3529 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
3530 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
3531 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, (%rsi)
3532 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rdx)
3533 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, (%rcx)
3534 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r8)
3535 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9)
3536 ; AVX512F-FAST-NEXT: vzeroupper
3537 ; AVX512F-FAST-NEXT: retq
3538 ;
3539 ; AVX512BW-LABEL: load_i16_stride5_vf32:
3540 ; AVX512BW: # %bb.0:
3541 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0
3542 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
3543 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
3544 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3
3545 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4
3546 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
3547 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3548 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5
3549 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u>
3550 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
3551 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
3552 ; AVX512BW-NEXT: kmovd %eax, %k1
3553 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1}
3554 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
3555 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5
3556 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
3557 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3558 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
3559 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u>
3560 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
3561 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1}
3562 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
3563 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm6
3564 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
3565 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3566 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
3567 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u>
3568 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8
3569 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1}
3570 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
3571 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm7
3572 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
3573 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
3574 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8
3575 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u>
3576 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9
3577 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
3578 ; AVX512BW-NEXT: kmovd %eax, %k1
3579 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1}
3580 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
3581 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8
3582 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u>
3583 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
3584 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
3585 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
3586 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1
3587 ; AVX512BW-NEXT: movb $7, %al
3588 ; AVX512BW-NEXT: kmovd %eax, %k1
3589 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
3590 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
3591 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3592 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi)
3593 ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx)
3594 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx)
3595 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8)
3596 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9)
3597 ; AVX512BW-NEXT: vzeroupper
3598 ; AVX512BW-NEXT: retq
3599 %wide.vec = load <160 x i16>, ptr %in.vec, align 64
3600 %strided.vec0 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155>
3601 %strided.vec1 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156>
3602 %strided.vec2 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157>
3603 %strided.vec3 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158>
3604 %strided.vec4 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159>
3605 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
3606 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
3607 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
3608 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
3609 store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
3610 ret void
3611 }
3613 define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
3614 ; SSE-LABEL: load_i16_stride5_vf64:
3615 ; SSE: # %bb.0:
3616 ; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8
3617 ; SSE-NEXT: movdqa 464(%rdi), %xmm5
3618 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3619 ; SSE-NEXT: movdqa 400(%rdi), %xmm8
3620 ; SSE-NEXT: movdqa 416(%rdi), %xmm11
3621 ; SSE-NEXT: movdqa 448(%rdi), %xmm4
3622 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3623 ; SSE-NEXT: movdqa 432(%rdi), %xmm7
3624 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3625 ; SSE-NEXT: movdqa 144(%rdi), %xmm6
3626 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3627 ; SSE-NEXT: movdqa 80(%rdi), %xmm15
3628 ; SSE-NEXT: movdqa 96(%rdi), %xmm10
3629 ; SSE-NEXT: movdqa 128(%rdi), %xmm14
3630 ; SSE-NEXT: movdqa 112(%rdi), %xmm2
3631 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3632 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
3633 ; SSE-NEXT: movdqa %xmm0, %xmm1
3634 ; SSE-NEXT: pandn %xmm2, %xmm1
3635 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3]
3636 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3637 ; SSE-NEXT: pand %xmm0, %xmm2
3638 ; SSE-NEXT: por %xmm1, %xmm2
3639 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
3640 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3641 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3642 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3]
3643 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
3644 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3645 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3]
3646 ; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
3647 ; SSE-NEXT: andps %xmm13, %xmm3
3648 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1]
3649 ; SSE-NEXT: movaps %xmm13, %xmm2
3650 ; SSE-NEXT: pandn %xmm1, %xmm2
3651 ; SSE-NEXT: por %xmm3, %xmm2
3652 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3653 ; SSE-NEXT: movdqa %xmm0, %xmm1
3654 ; SSE-NEXT: pandn %xmm7, %xmm1
3655 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
3656 ; SSE-NEXT: pand %xmm0, %xmm2
3657 ; SSE-NEXT: por %xmm1, %xmm2
3658 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
3659 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3660 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3661 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3]
3662 ; SSE-NEXT: movdqa %xmm8, %xmm6
3663 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3664 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7]
3665 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3666 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
3667 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
3668 ; SSE-NEXT: movaps %xmm13, %xmm2
3669 ; SSE-NEXT: andnps %xmm1, %xmm2
3670 ; SSE-NEXT: movdqa 32(%rdi), %xmm3
3671 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3672 ; SSE-NEXT: andps %xmm13, %xmm4
3673 ; SSE-NEXT: orps %xmm4, %xmm2
3674 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3675 ; SSE-NEXT: movdqa %xmm0, %xmm1
3676 ; SSE-NEXT: pandn %xmm3, %xmm1
3677 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
3678 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3679 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
3680 ; SSE-NEXT: pand %xmm0, %xmm2
3681 ; SSE-NEXT: por %xmm1, %xmm2
3682 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
3683 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3684 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
3685 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3686 ; SSE-NEXT: movdqa (%rdi), %xmm5
3687 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
3688 ; SSE-NEXT: movdqa %xmm5, %xmm9
3689 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3690 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3691 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3692 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
3693 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
3694 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3695 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3696 ; SSE-NEXT: movaps %xmm13, %xmm2
3697 ; SSE-NEXT: andnps %xmm1, %xmm2
3698 ; SSE-NEXT: andps %xmm13, %xmm4
3699 ; SSE-NEXT: orps %xmm4, %xmm2
3700 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3701 ; SSE-NEXT: movdqa 352(%rdi), %xmm2
3702 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3703 ; SSE-NEXT: movdqa %xmm0, %xmm1
3704 ; SSE-NEXT: pandn %xmm2, %xmm1
3705 ; SSE-NEXT: movdqa 368(%rdi), %xmm2
3706 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3707 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
3708 ; SSE-NEXT: pand %xmm0, %xmm2
3709 ; SSE-NEXT: por %xmm1, %xmm2
3710 ; SSE-NEXT: movdqa 336(%rdi), %xmm1
3711 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
3712 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
3713 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3714 ; SSE-NEXT: movdqa 320(%rdi), %xmm7
3715 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
3716 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3717 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3718 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3719 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
3720 ; SSE-NEXT: movdqa 384(%rdi), %xmm1
3721 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3722 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3723 ; SSE-NEXT: movaps %xmm13, %xmm2
3724 ; SSE-NEXT: andnps %xmm1, %xmm2
3725 ; SSE-NEXT: andps %xmm13, %xmm4
3726 ; SSE-NEXT: orps %xmm4, %xmm2
3727 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3728 ; SSE-NEXT: movdqa 272(%rdi), %xmm2
3729 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3730 ; SSE-NEXT: movdqa %xmm0, %xmm1
3731 ; SSE-NEXT: pandn %xmm2, %xmm1
3732 ; SSE-NEXT: movdqa 288(%rdi), %xmm2
3733 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3734 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
3735 ; SSE-NEXT: pand %xmm0, %xmm2
3736 ; SSE-NEXT: por %xmm1, %xmm2
3737 ; SSE-NEXT: movdqa 256(%rdi), %xmm12
3738 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
3739 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3740 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3741 ; SSE-NEXT: movdqa 240(%rdi), %xmm3
3742 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3743 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
3744 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3745 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3746 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
3747 ; SSE-NEXT: movdqa 304(%rdi), %xmm1
3748 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3749 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3750 ; SSE-NEXT: movaps %xmm13, %xmm2
3751 ; SSE-NEXT: andnps %xmm1, %xmm2
3752 ; SSE-NEXT: andps %xmm13, %xmm4
3753 ; SSE-NEXT: orps %xmm4, %xmm2
3754 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3755 ; SSE-NEXT: movdqa 592(%rdi), %xmm2
3756 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3757 ; SSE-NEXT: movdqa %xmm0, %xmm1
3758 ; SSE-NEXT: pandn %xmm2, %xmm1
3759 ; SSE-NEXT: movdqa 608(%rdi), %xmm2
3760 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3761 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
3762 ; SSE-NEXT: pand %xmm0, %xmm2
3763 ; SSE-NEXT: por %xmm1, %xmm2
3764 ; SSE-NEXT: movdqa 576(%rdi), %xmm1
3765 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3766 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
3767 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3768 ; SSE-NEXT: movdqa 560(%rdi), %xmm3
3769 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
3770 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3771 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3772 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3773 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
3774 ; SSE-NEXT: movdqa 624(%rdi), %xmm1
3775 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3776 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3777 ; SSE-NEXT: movaps %xmm13, %xmm2
3778 ; SSE-NEXT: andnps %xmm1, %xmm2
3779 ; SSE-NEXT: andps %xmm13, %xmm4
3780 ; SSE-NEXT: orps %xmm4, %xmm2
3781 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3782 ; SSE-NEXT: movdqa 192(%rdi), %xmm2
3783 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3784 ; SSE-NEXT: movdqa %xmm0, %xmm1
3785 ; SSE-NEXT: pandn %xmm2, %xmm1
3786 ; SSE-NEXT: movdqa 208(%rdi), %xmm2
3787 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3788 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3]
3789 ; SSE-NEXT: pand %xmm0, %xmm4
3790 ; SSE-NEXT: por %xmm1, %xmm4
3791 ; SSE-NEXT: movdqa 176(%rdi), %xmm1
3792 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3793 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
3794 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3795 ; SSE-NEXT: movdqa 160(%rdi), %xmm2
3796 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3797 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3]
3798 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
3799 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
3800 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3]
3801 ; SSE-NEXT: movdqa 224(%rdi), %xmm1
3802 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3803 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3804 ; SSE-NEXT: movaps %xmm13, %xmm4
3805 ; SSE-NEXT: andnps %xmm1, %xmm4
3806 ; SSE-NEXT: andps %xmm13, %xmm5
3807 ; SSE-NEXT: orps %xmm5, %xmm4
3808 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3809 ; SSE-NEXT: movdqa 528(%rdi), %xmm1
3810 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3811 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
3812 ; SSE-NEXT: pand %xmm0, %xmm1
3813 ; SSE-NEXT: movdqa 512(%rdi), %xmm2
3814 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3815 ; SSE-NEXT: pandn %xmm2, %xmm0
3816 ; SSE-NEXT: por %xmm1, %xmm0
3817 ; SSE-NEXT: movdqa 496(%rdi), %xmm1
3818 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3819 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
3820 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3821 ; SSE-NEXT: movdqa 480(%rdi), %xmm2
3822 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3823 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
3824 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
3825 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3826 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3]
3827 ; SSE-NEXT: movdqa 544(%rdi), %xmm0
3828 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3829 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3830 ; SSE-NEXT: movaps %xmm13, %xmm1
3831 ; SSE-NEXT: andnps %xmm0, %xmm1
3832 ; SSE-NEXT: andps %xmm13, %xmm4
3833 ; SSE-NEXT: orps %xmm4, %xmm1
3834 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3835 ; SSE-NEXT: psrlq $48, %xmm10
3836 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3837 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3]
3838 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
3839 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
3840 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535]
3841 ; SSE-NEXT: movdqa %xmm0, %xmm4
3842 ; SSE-NEXT: pandn %xmm1, %xmm4
3843 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3]
3844 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3845 ; SSE-NEXT: # xmm5 = mem[0,2,2,3]
3846 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
3847 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7]
3848 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3849 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
3850 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
3851 ; SSE-NEXT: pand %xmm0, %xmm1
3852 ; SSE-NEXT: por %xmm4, %xmm1
3853 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3854 ; SSE-NEXT: movdqa %xmm8, %xmm4
3855 ; SSE-NEXT: psllq $48, %xmm4
3856 ; SSE-NEXT: movaps %xmm13, %xmm2
3857 ; SSE-NEXT: andnps %xmm4, %xmm2
3858 ; SSE-NEXT: pand %xmm13, %xmm1
3859 ; SSE-NEXT: orps %xmm1, %xmm2
3860 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3861 ; SSE-NEXT: psrlq $48, %xmm11
3862 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
3863 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
3864 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
3865 ; SSE-NEXT: movdqa %xmm0, %xmm1
3866 ; SSE-NEXT: pandn %xmm4, %xmm1
3867 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3868 ; SSE-NEXT: # xmm4 = mem[1,3,2,3]
3869 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3870 ; SSE-NEXT: # xmm5 = mem[0,2,2,3]
3871 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3872 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
3873 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
3874 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
3875 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
3876 ; SSE-NEXT: pand %xmm0, %xmm4
3877 ; SSE-NEXT: por %xmm1, %xmm4
3878 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3879 ; SSE-NEXT: psllq $48, %xmm1
3880 ; SSE-NEXT: movdqa %xmm13, %xmm2
3881 ; SSE-NEXT: pandn %xmm1, %xmm2
3882 ; SSE-NEXT: pand %xmm13, %xmm4
3883 ; SSE-NEXT: por %xmm4, %xmm2
3884 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3885 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3886 ; SSE-NEXT: psrlq $48, %xmm1
3887 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,3,2,3]
3888 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
3889 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3890 ; SSE-NEXT: movdqa %xmm0, %xmm1
3891 ; SSE-NEXT: pandn %xmm4, %xmm1
3892 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3893 ; SSE-NEXT: # xmm4 = mem[1,3,2,3]
3894 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3895 ; SSE-NEXT: # xmm5 = mem[0,2,2,3]
3896 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3897 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
3898 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
3899 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
3900 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
3901 ; SSE-NEXT: pand %xmm0, %xmm4
3902 ; SSE-NEXT: por %xmm1, %xmm4
3903 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3904 ; SSE-NEXT: psllq $48, %xmm1
3905 ; SSE-NEXT: movdqa %xmm13, %xmm2
3906 ; SSE-NEXT: pandn %xmm1, %xmm2
3907 ; SSE-NEXT: pand %xmm13, %xmm4
3908 ; SSE-NEXT: por %xmm4, %xmm2
3909 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3910 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
3911 ; SSE-NEXT: psrlq $48, %xmm1
3912 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3]
3913 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
3914 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3915 ; SSE-NEXT: movdqa %xmm0, %xmm1
3916 ; SSE-NEXT: pandn %xmm4, %xmm1
3917 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3918 ; SSE-NEXT: # xmm4 = mem[1,3,2,3]
3919 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3920 ; SSE-NEXT: # xmm5 = mem[0,2,2,3]
3921 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3922 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
3923 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
3924 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
3925 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
3926 ; SSE-NEXT: pand %xmm0, %xmm4
3927 ; SSE-NEXT: por %xmm1, %xmm4
3928 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3929 ; SSE-NEXT: psllq $48, %xmm1
3930 ; SSE-NEXT: movdqa %xmm13, %xmm2
3931 ; SSE-NEXT: pandn %xmm1, %xmm2
3932 ; SSE-NEXT: pand %xmm13, %xmm4
3933 ; SSE-NEXT: por %xmm4, %xmm2
3934 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3935 ; SSE-NEXT: psrlq $48, %xmm12
3936 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3937 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3]
3938 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
3939 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
3940 ; SSE-NEXT: movdqa %xmm0, %xmm1
3941 ; SSE-NEXT: pandn %xmm4, %xmm1
3942 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3943 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3]
3944 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3945 ; SSE-NEXT: # xmm5 = mem[0,2,2,3]
3946 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3947 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
3948 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
3949 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
3950 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
3951 ; SSE-NEXT: pand %xmm0, %xmm4
3952 ; SSE-NEXT: por %xmm1, %xmm4
3953 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3954 ; SSE-NEXT: movdqa %xmm12, %xmm1
3955 ; SSE-NEXT: psllq $48, %xmm1
3956 ; SSE-NEXT: movdqa %xmm13, %xmm2
3957 ; SSE-NEXT: pandn %xmm1, %xmm2
3958 ; SSE-NEXT: pand %xmm13, %xmm4
3959 ; SSE-NEXT: por %xmm4, %xmm2
3960 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3961 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3962 ; SSE-NEXT: psrlq $48, %xmm1
3963 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3]
3964 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
3965 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3966 ; SSE-NEXT: movdqa %xmm0, %xmm1
3967 ; SSE-NEXT: pandn %xmm4, %xmm1
3968 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3969 ; SSE-NEXT: # xmm4 = mem[1,3,2,3]
3970 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3971 ; SSE-NEXT: # xmm5 = mem[0,2,2,3]
3972 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3973 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
3974 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
3975 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
3976 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
3977 ; SSE-NEXT: pand %xmm0, %xmm4
3978 ; SSE-NEXT: por %xmm1, %xmm4
3979 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3980 ; SSE-NEXT: psllq $48, %xmm1
3981 ; SSE-NEXT: movdqa %xmm13, %xmm2
3982 ; SSE-NEXT: pandn %xmm1, %xmm2
3983 ; SSE-NEXT: pand %xmm13, %xmm4
3984 ; SSE-NEXT: por %xmm4, %xmm2
3985 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3986 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3987 ; SSE-NEXT: movdqa %xmm7, %xmm1
3988 ; SSE-NEXT: psrlq $48, %xmm1
3989 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3990 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3]
3991 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
3992 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3993 ; SSE-NEXT: movdqa %xmm0, %xmm1
3994 ; SSE-NEXT: pandn %xmm4, %xmm1
3995 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3996 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
3997 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3998 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
3999 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4000 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
4001 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
4002 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
4003 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7]
4004 ; SSE-NEXT: pand %xmm0, %xmm4
4005 ; SSE-NEXT: por %xmm1, %xmm4
4006 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4007 ; SSE-NEXT: movdqa %xmm3, %xmm1
4008 ; SSE-NEXT: psllq $48, %xmm1
4009 ; SSE-NEXT: movdqa %xmm13, %xmm5
4010 ; SSE-NEXT: pandn %xmm1, %xmm5
4011 ; SSE-NEXT: pand %xmm13, %xmm4
4012 ; SSE-NEXT: por %xmm4, %xmm5
4013 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4014 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4015 ; SSE-NEXT: psrlq $48, %xmm1
4016 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4017 ; SSE-NEXT: # xmm4 = mem[0,3,2,3]
4018 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
4019 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4020 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4021 ; SSE-NEXT: # xmm1 = mem[1,3,2,3]
4022 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4023 ; SSE-NEXT: # xmm5 = mem[0,2,2,3]
4024 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
4025 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7]
4026 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4027 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
4028 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
4029 ; SSE-NEXT: pand %xmm0, %xmm1
4030 ; SSE-NEXT: pandn %xmm4, %xmm0
4031 ; SSE-NEXT: por %xmm1, %xmm0
4032 ; SSE-NEXT: pand %xmm13, %xmm0
4033 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4034 ; SSE-NEXT: psllq $48, %xmm1
4035 ; SSE-NEXT: pandn %xmm1, %xmm13
4036 ; SSE-NEXT: por %xmm0, %xmm13
4037 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4038 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4039 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4040 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
4041 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
4042 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4043 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535]
4044 ; SSE-NEXT: movaps %xmm6, %xmm4
4045 ; SSE-NEXT: andnps %xmm1, %xmm4
4046 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,1,1,3]
4047 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
4048 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4049 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4050 ; SSE-NEXT: pand %xmm6, %xmm5
4051 ; SSE-NEXT: por %xmm4, %xmm5
4052 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4053 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,0]
4054 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4055 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4056 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0]
4057 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4058 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4059 ; SSE-NEXT: movaps %xmm0, %xmm1
4060 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4061 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0]
4062 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,3]
4063 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4064 ; SSE-NEXT: movaps %xmm6, %xmm4
4065 ; SSE-NEXT: andnps %xmm1, %xmm4
4066 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4067 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3]
4068 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
4069 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4070 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
4071 ; SSE-NEXT: pand %xmm6, %xmm5
4072 ; SSE-NEXT: por %xmm4, %xmm5
4073 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4074 ; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4075 ; SSE-NEXT: # xmm4 = mem[0,1,2,0]
4076 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4077 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4078 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0]
4079 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4080 ; SSE-NEXT: movdqa %xmm10, %xmm1
4081 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4082 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0]
4083 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3]
4084 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4085 ; SSE-NEXT: movaps %xmm6, %xmm4
4086 ; SSE-NEXT: andnps %xmm1, %xmm4
4087 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,1,3]
4088 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
4089 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4090 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4091 ; SSE-NEXT: pand %xmm6, %xmm5
4092 ; SSE-NEXT: por %xmm4, %xmm5
4093 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4094 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,0]
4095 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4096 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4097 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0]
4098 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4099 ; SSE-NEXT: movdqa %xmm2, %xmm1
4100 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0]
4101 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3]
4102 ; SSE-NEXT: movdqa %xmm9, %xmm12
4103 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4104 ; SSE-NEXT: movaps %xmm6, %xmm4
4105 ; SSE-NEXT: andnps %xmm1, %xmm4
4106 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,1,3]
4107 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
4108 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
4109 ; SSE-NEXT: pand %xmm6, %xmm2
4110 ; SSE-NEXT: por %xmm4, %xmm2
4111 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4112 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0]
4113 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4114 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4115 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
4116 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4117 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4118 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4119 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
4120 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
4121 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4122 ; SSE-NEXT: movaps %xmm6, %xmm4
4123 ; SSE-NEXT: andnps %xmm1, %xmm4
4124 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4125 ; SSE-NEXT: # xmm5 = mem[0,1,1,3]
4126 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
4127 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4128 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
4129 ; SSE-NEXT: pand %xmm6, %xmm2
4130 ; SSE-NEXT: por %xmm4, %xmm2
4131 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4132 ; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4133 ; SSE-NEXT: # xmm4 = mem[0,1,2,0]
4134 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4135 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4136 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
4137 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4138 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4139 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4140 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[0,0]
4141 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,3]
4142 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4143 ; SSE-NEXT: movaps %xmm6, %xmm4
4144 ; SSE-NEXT: andnps %xmm1, %xmm4
4145 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4146 ; SSE-NEXT: # xmm5 = mem[0,1,1,3]
4147 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
4148 ; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload
4149 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
4150 ; SSE-NEXT: pand %xmm6, %xmm2
4151 ; SSE-NEXT: por %xmm4, %xmm2
4152 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4153 ; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4154 ; SSE-NEXT: # xmm4 = mem[0,1,2,0]
4155 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4156 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4157 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
4158 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4159 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4160 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4161 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
4162 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
4163 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4164 ; SSE-NEXT: movaps %xmm6, %xmm4
4165 ; SSE-NEXT: andnps %xmm1, %xmm4
4166 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4167 ; SSE-NEXT: # xmm5 = mem[0,1,1,3]
4168 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
4169 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4170 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
4171 ; SSE-NEXT: pand %xmm6, %xmm2
4172 ; SSE-NEXT: por %xmm4, %xmm2
4173 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4174 ; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4175 ; SSE-NEXT: # xmm4 = mem[0,1,2,0]
4176 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4177 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4178 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
4179 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4180 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4181 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4182 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0]
4183 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3]
4184 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
4185 ; SSE-NEXT: movaps %xmm6, %xmm4
4186 ; SSE-NEXT: andnps %xmm1, %xmm4
4187 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4188 ; SSE-NEXT: # xmm5 = mem[0,1,1,3]
4189 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
4190 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4191 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
4192 ; SSE-NEXT: pand %xmm6, %xmm2
4193 ; SSE-NEXT: por %xmm4, %xmm2
4194 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4195 ; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4196 ; SSE-NEXT: # xmm4 = mem[0,1,2,0]
4197 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
4198 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3]
4199 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
4200 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4201 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
4202 ; SSE-NEXT: movdqa %xmm6, %xmm4
4203 ; SSE-NEXT: pandn %xmm1, %xmm4
4204 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
4205 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3]
4206 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
4207 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7]
4208 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4209 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4210 ; SSE-NEXT: pand %xmm6, %xmm1
4211 ; SSE-NEXT: por %xmm4, %xmm1
4212 ; SSE-NEXT: movdqa %xmm1, %xmm2
4213 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0]
4214 ; SSE-NEXT: movaps %xmm6, %xmm3
4215 ; SSE-NEXT: andnps %xmm13, %xmm3
4216 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4217 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2]
4218 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7]
4219 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4220 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
4221 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
4222 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3]
4223 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0]
4224 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4225 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4226 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,2,2,2,4,5,6,7]
4227 ; SSE-NEXT: movdqa %xmm6, %xmm3
4228 ; SSE-NEXT: pandn %xmm1, %xmm3
4229 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4230 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
4231 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4232 ; SSE-NEXT: # xmm4 = mem[0,2,2,3]
4233 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4234 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
4235 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4236 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4237 ; SSE-NEXT: pand %xmm6, %xmm1
4238 ; SSE-NEXT: por %xmm3, %xmm1
4239 ; SSE-NEXT: movdqa %xmm1, %xmm4
4240 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4241 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0]
4242 ; SSE-NEXT: movaps %xmm6, %xmm2
4243 ; SSE-NEXT: andnps %xmm14, %xmm2
4244 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4245 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2]
4246 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7]
4247 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4248 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
4249 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
4250 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3]
4251 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
4252 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4253 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7]
4254 ; SSE-NEXT: movdqa %xmm6, %xmm3
4255 ; SSE-NEXT: pandn %xmm1, %xmm3
4256 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
4257 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4258 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
4259 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4260 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
4261 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4262 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4263 ; SSE-NEXT: pand %xmm6, %xmm1
4264 ; SSE-NEXT: por %xmm3, %xmm1
4265 ; SSE-NEXT: movdqa %xmm1, %xmm3
4266 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4267 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0]
4268 ; SSE-NEXT: movaps %xmm6, %xmm2
4269 ; SSE-NEXT: andnps %xmm12, %xmm2
4270 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4271 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2]
4272 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7]
4273 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4274 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
4275 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
4276 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
4277 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
4278 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4279 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4280 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
4281 ; SSE-NEXT: movdqa %xmm6, %xmm2
4282 ; SSE-NEXT: pandn %xmm1, %xmm2
4283 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4284 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
4285 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4286 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
4287 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4288 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
4289 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4290 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4291 ; SSE-NEXT: pand %xmm6, %xmm1
4292 ; SSE-NEXT: por %xmm2, %xmm1
4293 ; SSE-NEXT: movdqa %xmm1, %xmm3
4294 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4295 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0]
4296 ; SSE-NEXT: movaps %xmm4, %xmm2
4297 ; SSE-NEXT: movaps %xmm6, %xmm4
4298 ; SSE-NEXT: andnps %xmm2, %xmm4
4299 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4300 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4301 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7]
4302 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4303 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
4304 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
4305 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
4306 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
4307 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4308 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7]
4309 ; SSE-NEXT: movdqa %xmm6, %xmm2
4310 ; SSE-NEXT: pandn %xmm1, %xmm2
4311 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4312 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
4313 ; SSE-NEXT: movdqa %xmm9, %xmm11
4314 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
4315 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4316 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
4317 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4318 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7]
4319 ; SSE-NEXT: pand %xmm6, %xmm0
4320 ; SSE-NEXT: por %xmm2, %xmm0
4321 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4322 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0]
4323 ; SSE-NEXT: movaps %xmm6, %xmm2
4324 ; SSE-NEXT: andnps %xmm7, %xmm2
4325 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4326 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2]
4327 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7]
4328 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4329 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
4330 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
4331 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
4332 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
4333 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4334 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4335 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
4336 ; SSE-NEXT: movdqa %xmm6, %xmm2
4337 ; SSE-NEXT: pandn %xmm1, %xmm2
4338 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4339 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
4340 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4341 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
4342 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4343 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
4344 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4345 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7]
4346 ; SSE-NEXT: pand %xmm6, %xmm0
4347 ; SSE-NEXT: por %xmm2, %xmm0
4348 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4349 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0]
4350 ; SSE-NEXT: movaps %xmm4, %xmm2
4351 ; SSE-NEXT: movaps %xmm6, %xmm3
4352 ; SSE-NEXT: andnps %xmm4, %xmm3
4353 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4354 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4355 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7]
4356 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4357 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
4358 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
4359 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
4360 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
4361 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4362 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7]
4363 ; SSE-NEXT: movdqa %xmm6, %xmm2
4364 ; SSE-NEXT: pandn %xmm1, %xmm2
4365 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4366 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
4367 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4368 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
4369 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4370 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
4371 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4372 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[1,0,3,3,4,5,6,7]
4373 ; SSE-NEXT: pand %xmm6, %xmm14
4374 ; SSE-NEXT: por %xmm2, %xmm14
4375 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4376 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0]
4377 ; SSE-NEXT: movaps %xmm6, %xmm0
4378 ; SSE-NEXT: andnps %xmm10, %xmm0
4379 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4380 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2]
4381 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7]
4382 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4383 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
4384 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
4385 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
4386 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,0]
4387 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4388 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7]
4389 ; SSE-NEXT: movdqa %xmm6, %xmm2
4390 ; SSE-NEXT: pandn %xmm1, %xmm2
4391 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4392 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
4393 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4394 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
4395 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4396 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7]
4397 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4398 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7]
4399 ; SSE-NEXT: pand %xmm6, %xmm10
4400 ; SSE-NEXT: por %xmm2, %xmm10
4401 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4402 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4403 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
4404 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4405 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4406 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4407 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
4408 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
4409 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4410 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0]
4411 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2]
4412 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4413 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0]
4414 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2]
4415 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4416 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0]
4417 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
4418 ; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill
4419 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm9[3,0]
4420 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[0,2]
4421 ; SSE-NEXT: movdqa %xmm7, %xmm1
4422 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[3,0]
4423 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2]
4424 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4425 ; SSE-NEXT: movdqa %xmm4, %xmm1
4426 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[3,0]
4427 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2]
4428 ; SSE-NEXT: movaps %xmm1, %xmm15
4429 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4430 ; SSE-NEXT: movaps %xmm4, %xmm1
4431 ; SSE-NEXT: movaps %xmm4, %xmm12
4432 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
4433 ; SSE-NEXT: movaps %xmm0, %xmm11
4434 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
4435 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4436 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7]
4437 ; SSE-NEXT: pand %xmm6, %xmm8
4438 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
4439 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4440 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
4441 ; SSE-NEXT: pand %xmm6, %xmm7
4442 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7]
4443 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4444 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7]
4445 ; SSE-NEXT: pand %xmm6, %xmm0
4446 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
4447 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4448 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7]
4449 ; SSE-NEXT: pand %xmm6, %xmm5
4450 ; SSE-NEXT: pshufhw $232, (%rsp), %xmm2 # 16-byte Folded Reload
4451 ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7]
4452 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4453 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7]
4454 ; SSE-NEXT: pand %xmm6, %xmm4
4455 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7]
4456 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4457 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
4458 ; SSE-NEXT: pand %xmm6, %xmm3
4459 ; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4460 ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7]
4461 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4462 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
4463 ; SSE-NEXT: pand %xmm6, %xmm2
4464 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7]
4465 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
4466 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7]
4467 ; SSE-NEXT: pand %xmm6, %xmm9
4468 ; SSE-NEXT: andnps %xmm11, %xmm6
4469 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
4470 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7]
4471 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4472 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[0,1,0,3]
4473 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
4474 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3]
4475 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0]
4476 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4477 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4478 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
4479 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4480 ; SSE-NEXT: # xmm15 = mem[0,1,1,3]
4481 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4482 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
4483 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
4484 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,0]
4485 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4486 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4487 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
4488 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4489 ; SSE-NEXT: # xmm15 = mem[0,1,1,3]
4490 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4491 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
4492 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
4493 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0]
4494 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4495 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4496 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
4497 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4498 ; SSE-NEXT: # xmm15 = mem[0,1,1,3]
4499 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4500 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
4501 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
4502 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0]
4503 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4504 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4505 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
4506 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4507 ; SSE-NEXT: # xmm15 = mem[0,1,1,3]
4508 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4509 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
4510 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
4511 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0]
4512 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4513 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4514 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
4515 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4516 ; SSE-NEXT: # xmm15 = mem[0,1,1,3]
4517 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4518 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
4519 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
4520 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0]
4521 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4522 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4523 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
4524 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4525 ; SSE-NEXT: # xmm15 = mem[0,1,1,3]
4526 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4527 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
4528 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
4529 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0]
4530 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4531 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4532 ; SSE-NEXT: # xmm1 = mem[0,2,2,3]
4533 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4534 ; SSE-NEXT: # xmm15 = mem[0,1,1,3]
4535 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4536 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
4537 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3]
4538 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0]
4539 ; SSE-NEXT: orps %xmm9, %xmm6
4540 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,2,3]
4541 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,1,1,3]
4542 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
4543 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
4544 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[1,3]
4545 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,0]
4546 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4547 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
4548 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4549 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
4550 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4551 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
4552 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4553 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
4554 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4555 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
4556 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4557 ; SSE-NEXT: movaps %xmm1, (%rsi)
4558 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4559 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
4560 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4561 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
4562 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4563 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
4564 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4565 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
4566 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4567 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
4568 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4569 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
4570 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4571 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
4572 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4573 ; SSE-NEXT: movaps %xmm1, (%rdx)
4574 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4575 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
4576 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4577 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
4578 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4579 ; SSE-NEXT: movaps %xmm1, 96(%rcx)
4580 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4581 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
4582 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4583 ; SSE-NEXT: movaps %xmm1, 64(%rcx)
4584 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4585 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
4586 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4587 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
4588 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4589 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
4590 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4591 ; SSE-NEXT: movaps %xmm1, (%rcx)
4592 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4593 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
4594 ; SSE-NEXT: movaps %xmm10, 112(%r8)
4595 ; SSE-NEXT: movaps %xmm14, 96(%r8)
4596 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4597 ; SSE-NEXT: movaps %xmm1, 80(%r8)
4598 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4599 ; SSE-NEXT: movaps %xmm1, 64(%r8)
4600 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4601 ; SSE-NEXT: movaps %xmm1, 48(%r8)
4602 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4603 ; SSE-NEXT: movaps %xmm1, 32(%r8)
4604 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4605 ; SSE-NEXT: movaps %xmm1, 16(%r8)
4606 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4607 ; SSE-NEXT: movaps %xmm1, (%r8)
4608 ; SSE-NEXT: movaps %xmm6, 112(%r9)
4609 ; SSE-NEXT: movaps %xmm2, 96(%r9)
4610 ; SSE-NEXT: movaps %xmm3, 80(%r9)
4611 ; SSE-NEXT: movaps %xmm4, 64(%r9)
4612 ; SSE-NEXT: movaps %xmm5, 48(%r9)
4613 ; SSE-NEXT: movaps %xmm0, 32(%r9)
4614 ; SSE-NEXT: movaps %xmm7, 16(%r9)
4615 ; SSE-NEXT: movaps %xmm8, (%r9)
4616 ; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8
4617 ; SSE-NEXT: retq
4618 ;
4619 ; AVX1-ONLY-LABEL: load_i16_stride5_vf64:
4620 ; AVX1-ONLY: # %bb.0:
4621 ; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408
4622 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0
4623 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4624 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1
4625 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4626 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
4627 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
4628 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
4629 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2
4630 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4631 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
4632 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4633 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15
4634 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1]
4635 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4636 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3
4637 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4638 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
4639 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
4640 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
4641 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
4642 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0
4643 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4644 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4645 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3
4646 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4647 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
4648 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3
4649 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
4650 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
4651 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
4652 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4
4653 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4654 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
4655 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4656 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4657 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4658 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
4659 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
4660 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0
4661 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4662 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
4663 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4
4664 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
4665 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
4666 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4667 ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0
4668 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4669 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4670 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4671 ; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm12
4672 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1]
4673 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4674 ; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0
4675 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4676 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
4677 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
4678 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
4679 ; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm3
4680 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4681 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0
4682 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4683 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
4684 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
4685 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
4686 ; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0
4687 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4688 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
4689 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
4690 ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9
4691 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
4692 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4693 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4694 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4695 ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0
4696 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4697 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
4698 ; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm13
4699 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7]
4700 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4701 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
4702 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
4703 ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0
4704 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
4705 ; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm11
4706 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4707 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4
4708 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
4709 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
4710 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4711 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10
4712 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3]
4713 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4714 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4715 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
4716 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4717 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4718 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
4719 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4720 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
4721 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
4722 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
4723 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7
4724 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6
4725 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7]
4726 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4727 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4728 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
4729 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
4730 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0
4731 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4732 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
4733 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
4734 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
4735 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4736 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
4737 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4738 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4739 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0
4740 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4741 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4
4742 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4743 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
4744 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7]
4745 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
4746 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
4747 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0
4748 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4749 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
4750 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4
4751 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
4752 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
4753 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4754 ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8
4755 ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0
4756 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4757 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
4758 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4759 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
4760 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0
4761 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4762 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4763 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4764 ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0
4765 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4766 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4767 ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0
4768 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4769 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
4770 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
4771 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
4772 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
4773 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0
4774 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4775 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
4776 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
4777 ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0
4778 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4779 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
4780 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
4781 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4782 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0
4783 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4784 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
4785 ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0
4786 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4787 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7]
4788 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
4789 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2
4790 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0
4791 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4792 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1]
4793 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3
4794 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
4795 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0
4796 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4797 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
4798 ; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
4799 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7]
4800 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
4801 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4802 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3]
4803 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
4804 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
4805 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4806 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
4807 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
4808 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
4809 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
4810 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
4811 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4812 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3]
4813 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
4814 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
4815 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15
4816 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
4817 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4818 ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4819 ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
4820 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
4821 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
4822 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7]
4823 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0
4824 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4825 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm4, %xmm15
4826 ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15
4827 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0
4828 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
4829 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4830 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload
4831 ; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
4832 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
4833 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
4834 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4835 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3]
4836 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
4837 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
4838 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4839 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
4840 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
4841 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
4842 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
4843 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3]
4844 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
4845 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4846 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm15
4847 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
4848 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4849 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7]
4850 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15
4851 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
4852 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
4853 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm15
4854 ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15
4855 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3
4856 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
4857 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4858 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4859 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7]
4860 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
4861 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
4862 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4863 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3]
4864 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
4865 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
4866 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7]
4867 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
4868 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
4869 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4870 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
4871 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
4872 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4873 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm15
4874 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
4875 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4876 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4877 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
4878 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15
4879 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
4880 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3
4881 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4882 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm7, %xmm15
4883 ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15
4884 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3
4885 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
4886 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4887 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
4888 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7]
4889 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0
4890 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4891 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload
4892 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7]
4893 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7]
4894 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4895 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4896 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3]
4897 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
4898 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
4899 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7]
4900 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4901 ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
4902 ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5],mem[6,7]
4903 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
4904 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4905 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3]
4906 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
4907 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4908 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3
4909 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4910 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7]
4911 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1
4912 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4913 ; AVX1-ONLY-NEXT: vpsllq $48, %xmm2, %xmm2
4914 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
4915 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
4916 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
4917 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4918 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4919 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
4920 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
4921 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13>
4922 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
4923 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4924 ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3]
4925 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
4926 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
4927 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4928 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
4929 ; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
4930 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
4931 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
4932 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
4933 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4934 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
4935 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7]
4936 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
4937 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
4938 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4939 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3]
4940 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
4941 ; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload
4942 ; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
4943 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
4944 ; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4945 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0]
4946 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
4947 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
4948 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
4949 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4950 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4951 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
4952 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
4953 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3
4954 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4955 ; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3]
4956 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
4957 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
4958 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
4959 ; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
4960 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
4961 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
4962 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
4963 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7]
4964 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
4965 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4966 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3]
4967 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
4968 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4969 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
4970 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
4971 ; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4972 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0]
4973 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
4974 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
4975 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
4976 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4977 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
4978 ; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
4979 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3
4980 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3]
4981 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
4982 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
4983 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4984 ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4985 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7]
4986 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
4987 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
4988 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
4989 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
4990 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3]
4991 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
4992 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3]
4993 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
4994 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0]
4995 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
4996 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
4997 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
4998 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4999 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5000 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5001 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
5002 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2
5003 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5004 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
5005 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0
5006 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5007 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
5008 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
5009 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
5010 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
5011 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5012 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5013 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
5014 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
5015 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5016 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3]
5017 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
5018 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5019 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
5020 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
5021 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5022 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0]
5023 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
5024 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
5025 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
5026 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5027 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5028 ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5029 ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
5030 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
5031 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5032 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2
5033 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
5034 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5035 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
5036 ; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7]
5037 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
5038 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
5039 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
5040 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
5041 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
5042 ; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
5043 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
5044 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5045 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
5046 ; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
5047 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
5048 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
5049 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
5050 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5051 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3]
5052 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
5053 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
5054 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
5055 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5056 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5057 ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5058 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
5059 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15>
5060 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
5061 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5062 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5
5063 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
5064 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5065 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
5066 ; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
5067 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
5068 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
5069 ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
5070 ; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
5071 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
5072 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
5073 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5074 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
5075 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
5076 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
5077 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
5078 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
5079 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5080 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3]
5081 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
5082 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
5083 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
5084 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5085 ; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
5086 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7]
5087 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
5088 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm5
5089 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
5090 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
5091 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
5092 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
5093 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
5094 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
5095 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4
5096 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
5097 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
5098 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
5099 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
5100 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
5101 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
5102 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
5103 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
5104 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5105 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5106 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
5107 ; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
5108 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2
5109 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5110 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5111 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
5112 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0
5113 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5114 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm3
5115 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
5116 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
5117 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5118 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5119 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
5120 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1
5121 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5122 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5123 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7]
5124 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
5125 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
5126 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7]
5127 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5128 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
5129 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
5130 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
5131 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
5132 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5133 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5134 ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3]
5135 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
5136 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5137 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,2,2,3]
5138 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5139 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5140 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5141 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3]
5142 ; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
5143 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7]
5144 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5145 ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3]
5146 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
5147 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
5148 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
5149 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
5150 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7]
5151 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5152 ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1]
5153 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5154 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3]
5155 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
5156 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
5157 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
5158 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5159 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7]
5160 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5161 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3]
5162 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
5163 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7]
5164 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
5165 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5166 ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3]
5167 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
5168 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5169 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3]
5170 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
5171 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
5172 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5173 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3]
5174 ; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
5175 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
5176 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5177 ; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3]
5178 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7]
5179 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
5180 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5181 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
5182 ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7]
5183 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5184 ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1]
5185 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
5186 ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3]
5187 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
5188 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1]
5189 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
5190 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7]
5191 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3]
5192 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
5193 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7]
5194 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
5195 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3]
5196 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
5197 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
5198 ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3]
5199 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
5200 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1]
5201 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3]
5202 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7]
5203 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
5204 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7]
5205 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7]
5206 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
5207 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1]
5208 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3]
5209 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7]
5210 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
5211 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm12
5212 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7]
5213 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3]
5214 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
5215 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
5216 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12
5217 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5218 ; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3]
5219 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
5220 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5221 ; AVX1-ONLY-NEXT: # xmm13 = mem[0,2,2,3]
5222 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
5223 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
5224 ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5225 ; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3]
5226 ; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
5227 ; AVX1-ONLY-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7]
5228 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
5229 ; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3]
5230 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7]
5231 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7]
5232 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5233 ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload
5234 ; AVX1-ONLY-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
5235 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2
5236 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5237 ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1]
5238 ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5239 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3]
5240 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5241 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
5242 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7]
5243 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5244 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,1,3]
5245 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
5246 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
5247 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
5248 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5249 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi)
5250 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5251 ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi)
5252 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5253 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi)
5254 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5255 ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi)
5256 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5257 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx)
5258 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5259 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx)
5260 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5261 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx)
5262 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5263 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx)
5264 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5265 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx)
5266 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5267 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx)
5268 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5269 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx)
5270 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5271 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx)
5272 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5273 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8)
5274 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5275 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8)
5276 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5277 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8)
5278 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5279 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8)
5280 ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9)
5281 ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9)
5282 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9)
5283 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9)
5284 ; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408
5285 ; AVX1-ONLY-NEXT: vzeroupper
5286 ; AVX1-ONLY-NEXT: retq
5287 ;
5288 ; AVX2-SLOW-LABEL: load_i16_stride5_vf64:
5289 ; AVX2-SLOW: # %bb.0:
5290 ; AVX2-SLOW-NEXT: subq $1048, %rsp # imm = 0x418
5291 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10
5292 ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4
5293 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm14
5294 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm7
5295 ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm8
5296 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5297 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3
5298 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5299 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5
5300 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5301 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0
5302 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5303 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1
5304 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
5305 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
5306 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5307 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
5308 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
5309 ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm2
5310 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15]
5311 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3
5312 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
5313 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
5314 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3
5315 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
5316 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
5317 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5318 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
5319 ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11
5320 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5321 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
5322 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
5323 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
5324 ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5325 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8
5326 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5327 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
5328 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
5329 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm13
5330 ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
5331 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3
5332 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
5333 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5334 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15]
5335 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5336 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5337 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
5338 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
5339 ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4
5340 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6
5341 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15]
5342 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15
5343 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5344 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6
5345 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5346 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
5347 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
5348 ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2
5349 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3
5350 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
5351 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5352 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9
5353 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4
5354 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15]
5355 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm12
5356 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5357 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5358 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
5359 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
5360 ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1
5361 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5
5362 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7
5363 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15]
5364 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5365 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5366 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
5367 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
5368 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0
5369 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
5370 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5371 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5372 ; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
5373 ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5374 ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
5375 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5376 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
5377 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5378 ; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5379 ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
5380 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
5381 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
5382 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
5383 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1
5384 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
5385 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
5386 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
5387 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5388 ; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
5389 ; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
5390 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
5391 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
5392 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
5393 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8
5394 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7]
5395 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1
5396 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
5397 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm11
5398 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15]
5399 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
5400 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
5401 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15]
5402 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
5403 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
5404 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1
5405 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8
5406 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm8, %ymm1, %ymm6
5407 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15]
5408 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
5409 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7]
5410 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0
5411 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
5412 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10
5413 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
5414 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4
5415 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm2
5416 ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm9
5417 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3
5418 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3]
5419 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
5420 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8
5421 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5422 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5423 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15]
5424 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
5425 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5426 ; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm5
5427 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm12
5428 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm5[1],xmm12[2,3]
5429 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm15
5430 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5431 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm10
5432 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5433 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5434 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15]
5435 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
5436 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5437 ; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5
5438 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm4
5439 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm5[1],xmm4[2,3]
5440 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm10
5441 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5442 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8
5443 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm13
5444 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
5445 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5446 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15]
5447 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7]
5448 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5449 ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5
5450 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4
5451 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3]
5452 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5453 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5454 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0
5455 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5456 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5457 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
5458 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
5459 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5460 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm14
5461 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5462 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3]
5463 ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5464 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
5465 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
5466 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5467 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5468 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
5469 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5470 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5471 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3]
5472 ; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5473 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
5474 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5475 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
5476 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
5477 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5478 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
5479 ; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10
5480 ; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5481 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
5482 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5483 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15]
5484 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
5485 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5486 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3]
5487 ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0
5488 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5489 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
5490 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5491 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5492 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5493 ; AVX2-SLOW-NEXT: vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
5494 ; AVX2-SLOW-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15]
5495 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5496 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
5497 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5498 ; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
5499 ; AVX2-SLOW-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
5500 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5501 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
5502 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
5503 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3
5504 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
5505 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
5506 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
5507 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3]
5508 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
5509 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
5510 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5511 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
5512 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
5513 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5514 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5515 ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
5516 ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
5517 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
5518 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
5519 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5520 ; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
5521 ; AVX2-SLOW-NEXT: # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
5522 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
5523 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
5524 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3
5525 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6
5526 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
5527 ; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload
5528 ; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm12[1],mem[2,3]
5529 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
5530 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5531 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
5532 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
5533 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5534 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5535 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5536 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
5537 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
5538 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
5539 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5540 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5541 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
5542 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
5543 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
5544 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3
5545 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6
5546 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
5547 ; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload
5548 ; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm10[1],mem[2,3]
5549 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
5550 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5551 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
5552 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
5553 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5554 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5555 ; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
5556 ; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15]
5557 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
5558 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
5559 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
5560 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5561 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5562 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
5563 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
5564 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7]
5565 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
5566 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5567 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5568 ; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5569 ; AVX2-SLOW-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3]
5570 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
5571 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5572 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
5573 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5574 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5575 ; AVX2-SLOW-NEXT: vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
5576 ; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15]
5577 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5578 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
5579 ; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
5580 ; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15]
5581 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
5582 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
5583 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
5584 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3
5585 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
5586 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
5587 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
5588 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5589 ; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload
5590 ; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm15[2],mem[3]
5591 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
5592 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
5593 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5594 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
5595 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
5596 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5597 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5598 ; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
5599 ; AVX2-SLOW-NEXT: # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15]
5600 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
5601 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
5602 ; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
5603 ; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15]
5604 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
5605 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
5606 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3
5607 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6
5608 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
5609 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5610 ; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload
5611 ; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm11[2],mem[3]
5612 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
5613 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5614 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
5615 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
5616 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5617 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
5618 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
5619 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
5620 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
5621 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
5622 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
5623 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3
5624 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6
5625 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
5626 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5627 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5628 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3]
5629 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6
5630 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5631 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
5632 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
5633 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5634 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5635 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15]
5636 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
5637 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
5638 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2
5639 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15]
5640 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6
5641 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3]
5642 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
5643 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
5644 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5645 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5646 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3]
5647 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
5648 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5649 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
5650 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5651 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5652 ; AVX2-SLOW-NEXT: vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
5653 ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15]
5654 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
5655 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7]
5656 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5657 ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
5658 ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
5659 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
5660 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
5661 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
5662 ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1]
5663 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0
5664 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
5665 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
5666 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
5667 ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5668 ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3]
5669 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
5670 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
5671 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
5672 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5673 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5674 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
5675 ; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
5676 ; AVX2-SLOW-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
5677 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
5678 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
5679 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5680 ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5681 ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
5682 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
5683 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
5684 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2
5685 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
5686 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
5687 ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5688 ; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3]
5689 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
5690 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3]
5691 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
5692 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5693 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5694 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
5695 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5696 ; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
5697 ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
5698 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
5699 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
5700 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5701 ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
5702 ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
5703 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
5704 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
5705 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4
5706 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
5707 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
5708 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3]
5709 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
5710 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3]
5711 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
5712 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
5713 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
5714 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
5715 ; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
5716 ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15]
5717 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
5718 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7]
5719 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5
5720 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5721 ; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5722 ; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15]
5723 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
5724 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
5725 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3
5726 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
5727 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3]
5728 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
5729 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3]
5730 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
5731 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
5732 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
5733 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
5734 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5735 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi)
5736 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5737 ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi)
5738 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5739 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi)
5740 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5741 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi)
5742 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5743 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx)
5744 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5745 ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx)
5746 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5747 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx)
5748 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5749 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx)
5750 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5751 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx)
5752 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5753 ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx)
5754 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5755 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx)
5756 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5757 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx)
5758 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5759 ; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8)
5760 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8)
5761 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5762 ; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8)
5763 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5764 ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8)
5765 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%r9)
5766 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9)
5767 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r9)
5768 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9)
5769 ; AVX2-SLOW-NEXT: addq $1048, %rsp # imm = 0x418
5770 ; AVX2-SLOW-NEXT: vzeroupper
5771 ; AVX2-SLOW-NEXT: retq
5772 ;
5773 ; AVX2-FAST-LABEL: load_i16_stride5_vf64:
5774 ; AVX2-FAST: # %bb.0:
5775 ; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8
5776 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9
5777 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5778 ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm6
5779 ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm11
5780 ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm8
5781 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5782 ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10
5783 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5784 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4
5785 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5786 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3
5787 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5788 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0
5789 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5790 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1
5791 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5792 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
5793 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5794 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
5795 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
5796 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2
5797 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15]
5798 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3]
5799 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4
5800 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
5801 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4
5802 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
5803 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
5804 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5805 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
5806 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
5807 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
5808 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2
5809 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
5810 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5811 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14
5812 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5813 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4
5814 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4
5815 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
5816 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5817 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
5818 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13],ymm4[14],ymm9[15]
5819 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15
5820 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5821 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
5822 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
5823 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm8
5824 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm12
5825 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15]
5826 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5827 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5828 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4
5829 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4
5830 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2
5831 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
5832 ; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
5833 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm4
5834 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7
5835 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
5836 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5837 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9
5838 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5839 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
5840 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
5841 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0
5842 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5
5843 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm6
5844 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
5845 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10
5846 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5847 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5848 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1
5849 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
5850 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
5851 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5852 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5853 ; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5854 ; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15]
5855 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5856 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
5857 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
5858 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5859 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5860 ; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5861 ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
5862 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6>
5863 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3
5864 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
5865 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3
5866 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm6
5867 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5868 ; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5869 ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
5870 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3
5871 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7]
5872 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5873 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15]
5874 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3
5875 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3
5876 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3
5877 ; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
5878 ; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15]
5879 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11
5880 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7]
5881 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15]
5882 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm11
5883 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm11
5884 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5885 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0
5886 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
5887 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
5888 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7]
5889 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm4
5890 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15]
5891 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2
5892 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1
5893 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1
5894 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2
5895 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7]
5896 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11
5897 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12
5898 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
5899 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11
5900 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5901 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
5902 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
5903 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5904 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm2
5905 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11
5906 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10
5907 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11
5908 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5909 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
5910 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
5911 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5912 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2
5913 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11
5914 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9
5915 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11
5916 ; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload
5917 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
5918 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
5919 ; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
5920 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2
5921 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm5
5922 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4
5923 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5924 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5
5925 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14
5926 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5927 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
5928 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
5929 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5930 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7]
5931 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm11
5932 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8
5933 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5934 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
5935 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11
5936 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15]
5937 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
5938 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5939 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6
5940 ; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm12
5941 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5942 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6
5943 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
5944 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7]
5945 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5946 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm3
5947 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm5
5948 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5949 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3
5950 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
5951 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
5952 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5953 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm0
5954 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0
5955 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
5956 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5957 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5958 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5959 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5960 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15]
5961 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
5962 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
5963 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
5964 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6
5965 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5966 ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
5967 ; AVX2-FAST-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
5968 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4>
5969 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm7
5970 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23>
5971 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7
5972 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7]
5973 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0]
5974 ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
5975 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm11
5976 ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8
5977 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11
5978 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
5979 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
5980 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5981 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5982 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
5983 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15]
5984 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11
5985 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
5986 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7
5987 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5988 ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
5989 ; AVX2-FAST-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
5990 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11
5991 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11
5992 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
5993 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm11
5994 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm11
5995 ; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13
5996 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
5997 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
5998 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5999 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6000 ; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
6001 ; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15]
6002 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11
6003 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
6004 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6005 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6006 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15]
6007 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11
6008 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11
6009 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7
6010 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
6011 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm11
6012 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11
6013 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
6014 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7]
6015 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6016 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6017 ; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
6018 ; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15]
6019 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11
6020 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
6021 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0
6022 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6023 ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
6024 ; AVX2-FAST-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
6025 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1
6026 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
6027 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
6028 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
6029 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
6030 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6031 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6032 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6033 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15]
6034 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
6035 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
6036 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
6037 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6038 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6039 ; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
6040 ; AVX2-FAST-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15]
6041 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,u,u,5,0,2,7>
6042 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm6
6043 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21>
6044 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6
6045 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
6046 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0]
6047 ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1]
6048 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
6049 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
6050 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7
6051 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
6052 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
6053 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6054 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
6055 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7
6056 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
6057 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6058 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6059 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
6060 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7
6061 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7
6062 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
6063 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
6064 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7
6065 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
6066 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
6067 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6068 ; AVX2-FAST-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
6069 ; AVX2-FAST-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15]
6070 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7
6071 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
6072 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
6073 ; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12
6074 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7
6075 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7
6076 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6077 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
6078 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6079 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm7
6080 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7
6081 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2
6082 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
6083 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
6084 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6085 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6086 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15]
6087 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7
6088 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
6089 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6090 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6091 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
6092 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1
6093 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
6094 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
6095 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
6096 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm1
6097 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
6098 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6099 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6100 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6101 ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6102 ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
6103 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
6104 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
6105 ; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
6106 ; AVX2-FAST-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
6107 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
6108 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
6109 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,u,u,6,0,3,5>
6110 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2
6111 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
6112 ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1]
6113 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2
6114 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
6115 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7]
6116 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
6117 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
6118 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
6119 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
6120 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6121 ; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6122 ; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15]
6123 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm11
6124 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7]
6125 ; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload
6126 ; AVX2-FAST-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
6127 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3
6128 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm11
6129 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11
6130 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7]
6131 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
6132 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9
6133 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
6134 ; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
6135 ; AVX2-FAST-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15]
6136 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6137 ; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6138 ; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
6139 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12
6140 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7]
6141 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm9
6142 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9
6143 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11
6144 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7]
6145 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm8
6146 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8
6147 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
6148 ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
6149 ; AVX2-FAST-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
6150 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11
6151 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7]
6152 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6
6153 ; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload
6154 ; AVX2-FAST-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
6155 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5
6156 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5
6157 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7]
6158 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm0
6159 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0
6160 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
6161 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6162 ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi)
6163 ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
6164 ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi)
6165 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6166 ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi)
6167 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6168 ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi)
6169 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6170 ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx)
6171 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6172 ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx)
6173 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6174 ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx)
6175 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6176 ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx)
6177 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6178 ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx)
6179 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6180 ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx)
6181 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6182 ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx)
6183 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6184 ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx)
6185 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8)
6186 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6187 ; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8)
6188 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6189 ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8)
6190 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6191 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8)
6192 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9)
6193 ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r9)
6194 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r9)
6195 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r9)
6196 ; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8
6197 ; AVX2-FAST-NEXT: vzeroupper
6198 ; AVX2-FAST-NEXT: retq
6199 ;
6200 ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf64:
6201 ; AVX2-FAST-PERLANE: # %bb.0:
6202 ; AVX2-FAST-PERLANE-NEXT: subq $1080, %rsp # imm = 0x438
6203 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm13
6204 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm5
6205 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6206 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6
6207 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6208 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm7
6209 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6210 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8
6211 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2
6212 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6213 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4
6214 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1
6215 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6216 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0
6217 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6218 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15]
6219 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6220 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
6221 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
6222 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm3
6223 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
6224 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm15
6225 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6226 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
6227 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7]
6228 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
6229 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm4
6230 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
6231 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
6232 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
6233 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
6234 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6235 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
6236 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
6237 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
6238 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6
6239 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
6240 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm14
6241 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3
6242 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4
6243 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3
6244 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7
6245 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6246 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15]
6247 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6248 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6249 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
6250 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
6251 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2
6252 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5
6253 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15]
6254 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11
6255 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6256 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12
6257 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6258 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6
6259 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
6260 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3
6261 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4
6262 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
6263 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6264 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm9
6265 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10
6266 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
6267 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6268 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6269 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
6270 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7]
6271 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1
6272 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4
6273 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5
6274 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
6275 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6276 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6277 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6
6278 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7]
6279 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0
6280 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm2
6281 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
6282 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6283 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6284 ; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6285 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15]
6286 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6287 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6288 ; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
6289 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
6290 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3
6291 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
6292 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
6293 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0
6294 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
6295 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1
6296 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
6297 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6298 ; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
6299 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
6300 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6301 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6302 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6303 ; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
6304 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
6305 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8
6306 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7]
6307 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0
6308 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1
6309 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm3
6310 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
6311 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
6312 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
6313 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
6314 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
6315 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
6316 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0
6317 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8
6318 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1
6319 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2
6320 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
6321 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1]
6322 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
6323 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm7
6324 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
6325 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11
6326 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
6327 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6
6328 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12
6329 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1
6330 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0
6331 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3]
6332 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9
6333 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
6334 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7
6335 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6336 ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
6337 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
6338 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
6339 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6340 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm4
6341 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm5
6342 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3]
6343 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8
6344 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm10
6345 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7
6346 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6347 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6348 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
6349 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
6350 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6351 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm4
6352 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm5
6353 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3]
6354 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6355 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6
6356 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7
6357 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6358 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6359 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
6360 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
6361 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6362 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4
6363 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm7
6364 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm4[1],xmm7[2,3]
6365 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6366 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm11
6367 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6368 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0
6369 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6370 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6371 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
6372 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
6373 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6374 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm4
6375 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6376 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14
6377 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6378 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3]
6379 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
6380 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0
6381 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6382 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6383 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
6384 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6385 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6386 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
6387 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13
6388 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6389 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm10
6390 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill
6391 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0
6392 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6393 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
6394 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6395 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6396 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3]
6397 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm3
6398 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6399 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0
6400 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6401 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
6402 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6403 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6404 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3]
6405 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0
6406 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6407 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
6408 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
6409 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6410 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6411 ; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
6412 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
6413 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6414 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
6415 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6416 ; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
6417 ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15]
6418 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
6419 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
6420 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
6421 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm2
6422 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11
6423 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
6424 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
6425 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
6426 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3]
6427 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
6428 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
6429 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6430 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
6431 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
6432 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6433 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6434 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6435 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
6436 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
6437 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
6438 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6439 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15]
6440 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
6441 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
6442 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2
6443 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9
6444 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
6445 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3]
6446 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
6447 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6448 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
6449 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
6450 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6451 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6452 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
6453 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
6454 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
6455 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
6456 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6457 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6458 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
6459 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
6460 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
6461 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27>
6462 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2
6463 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9
6464 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
6465 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6466 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3]
6467 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
6468 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6469 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
6470 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
6471 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6472 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6473 ; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6474 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15]
6475 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
6476 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
6477 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2
6478 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6479 ; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6480 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
6481 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9
6482 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7]
6483 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0
6484 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
6485 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6486 ; AVX2-FAST-PERLANE-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6487 ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3]
6488 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1
6489 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6490 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6491 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6492 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6493 ; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
6494 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15]
6495 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6496 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
6497 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6498 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15]
6499 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
6500 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
6501 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29>
6502 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2
6503 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
6504 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1
6505 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
6506 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6507 ; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
6508 ; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3]
6509 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
6510 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
6511 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6512 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
6513 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
6514 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6515 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
6516 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
6517 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
6518 ; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
6519 ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15]
6520 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
6521 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
6522 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2
6523 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9
6524 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
6525 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6526 ; AVX2-FAST-PERLANE-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload
6527 ; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3]
6528 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
6529 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6530 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
6531 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
6532 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6533 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
6534 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
6535 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
6536 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15]
6537 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12
6538 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
6539 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2
6540 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9
6541 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
6542 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6543 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3]
6544 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9
6545 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6546 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
6547 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
6548 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6549 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6550 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6551 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15]
6552 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
6553 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
6554 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2
6555 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6556 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6557 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
6558 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9
6559 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3]
6560 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0
6561 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
6562 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6563 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6564 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3]
6565 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1
6566 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6567 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
6568 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6569 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6570 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6571 ; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6572 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15]
6573 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6574 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
6575 ; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
6576 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
6577 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
6578 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
6579 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
6580 ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1]
6581 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1
6582 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
6583 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2
6584 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7]
6585 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
6586 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6587 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm4
6588 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
6589 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6590 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6
6591 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
6592 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6593 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7]
6594 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6595 ; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6596 ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
6597 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
6598 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
6599 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6600 ; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6601 ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
6602 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm14
6603 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7]
6604 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6605 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4
6606 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
6607 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6608 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4
6609 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload
6610 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8
6611 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
6612 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6613 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
6614 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm4
6615 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5
6616 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6617 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15]
6618 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
6619 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
6620 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
6621 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8
6622 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
6623 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm5
6624 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7
6625 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
6626 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6627 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
6628 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6629 ; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
6630 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
6631 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
6632 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
6633 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm0
6634 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6635 ; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
6636 ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
6637 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7
6638 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7]
6639 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm3
6640 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
6641 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm3
6642 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6643 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1
6644 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
6645 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6646 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6647 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6648 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi)
6649 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6650 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
6651 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6652 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi)
6653 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6654 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi)
6655 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6656 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx)
6657 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6658 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx)
6659 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6660 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx)
6661 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6662 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx)
6663 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6664 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx)
6665 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6666 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx)
6667 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6668 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx)
6669 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6670 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx)
6671 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6672 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8)
6673 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6674 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8)
6675 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6676 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8)
6677 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6678 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8)
6679 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9)
6680 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9)
6681 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9)
6682 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9)
6683 ; AVX2-FAST-PERLANE-NEXT: addq $1080, %rsp # imm = 0x438
6684 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
6685 ; AVX2-FAST-PERLANE-NEXT: retq
6687 ; AVX512F-SLOW-LABEL: load_i16_stride5_vf64:
6688 ; AVX512F-SLOW: # %bb.0:
6689 ; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248
6690 ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1
6691 ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2
6692 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15]
6693 ; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6
6694 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6695 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm7
6696 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6697 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6698 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
6699 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128]
6700 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
6701 ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3
6702 ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4
6703 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
6704 ; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm8
6705 ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6706 ; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm9
6707 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6708 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
6709 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
6710 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u>
6711 ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3
6712 ; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19
6713 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm12
6714 ; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm15
6715 ; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13
6716 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm14
6717 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4
6718 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
6719 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
6720 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10
6721 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4],ymm3[5],ymm10[6,7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12],ymm3[13],ymm10[14,15]
6722 ; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm11
6723 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6724 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
6725 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0
6726 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
6727 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6728 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
6729 ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1
6730 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0
6731 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6732 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
6733 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6734 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6735 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128]
6736 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2
6737 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
6738 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
6739 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
6740 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u>
6741 ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3
6742 ; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2
6743 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6744 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
6745 ; AVX512F-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill
6746 ; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6747 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
6748 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
6749 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0
6750 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
6751 ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6752 ; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6753 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6754 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
6755 ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1
6756 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0
6757 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6758 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
6759 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6760 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
6761 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
6762 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3]
6763 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26
6764 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm24
6765 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
6766 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
6767 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6768 ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2
6769 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3
6770 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3]
6771 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18
6772 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6
6773 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
6774 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6775 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
6776 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
6777 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
6778 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
6779 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
6780 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6781 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
6782 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
6783 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
6784 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
6785 ; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3
6786 ; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4
6787 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
6788 ; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm5
6789 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
6790 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
6791 ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
6792 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
6793 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6794 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
6795 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6796 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6797 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
6798 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6799 ; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13
6800 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3]
6801 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
6802 ; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm3
6803 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
6804 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7
6805 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
6806 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6807 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6808 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm3
6809 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
6810 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
6811 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[0,2,2,3]
6812 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
6813 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6814 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6815 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm13[1],xmm7[2,3]
6816 ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm10
6817 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
6818 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
6819 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20
6820 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
6821 ; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1
6822 ; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2
6823 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
6824 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17
6825 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16
6826 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6827 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
6828 ; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6
6829 ; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm2
6830 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15]
6831 ; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm11
6832 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
6833 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
6834 ; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm27
6835 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[3,1,2,3]
6836 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
6837 ; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm28
6838 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm28[0,2,2,3]
6839 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
6840 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
6841 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23>
6842 ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2
6843 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
6844 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
6845 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8
6846 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6847 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7]
6848 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6849 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm7
6850 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
6851 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6852 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15]
6853 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7
6854 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7]
6855 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
6856 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15]
6857 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
6858 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1,2],ymm7[3],ymm2[4,5,6,7]
6859 ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2
6860 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm24[3,1,2,3]
6861 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
6862 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm26[0,2,2,3]
6863 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
6864 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
6865 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7]
6866 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6867 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
6868 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm1
6869 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm3[1],xmm1[2,3]
6870 ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7
6871 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3
6872 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
6873 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6874 ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload
6875 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
6876 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6877 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm13[2],xmm10[3]
6878 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm29
6879 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30
6880 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
6881 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
6882 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31
6883 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6884 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13],ymm11[14],ymm6[15]
6885 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23
6886 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19
6887 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8
6888 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4,5,6,7]
6889 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm28[0,3,2,3]
6890 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
6891 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm27, %xmm13
6892 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
6893 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
6894 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2
6895 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm2[3,4,5,6,7]
6896 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
6897 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6
6898 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3
6899 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15]
6900 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14
6901 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2],xmm8[3]
6902 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
6903 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8
6904 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6905 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7]
6906 ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload
6907 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
6908 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6909 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15]
6910 ; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm10
6911 ; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm9
6912 ; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6913 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
6914 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7]
6915 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0
6916 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,3,2,3]
6917 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm20
6918 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
6919 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm24, %xmm8
6920 ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
6921 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
6922 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6923 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15]
6924 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18
6925 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26
6926 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8
6927 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3]
6928 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2
6929 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6930 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6931 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm7[2],xmm1[3]
6932 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm21
6933 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22
6934 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1
6935 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
6936 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6937 ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload
6938 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
6939 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6940 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6941 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6942 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5],ymm5[6],ymm11[7,8],ymm5[9],ymm11[10,11],ymm5[12],ymm11[13],ymm5[14],ymm11[15]
6943 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6944 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
6945 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
6946 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
6947 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1
6948 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2
6949 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15]
6950 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
6951 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4],xmm2[5,6,7]
6952 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
6953 ; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8
6954 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
6955 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2
6956 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7]
6957 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8
6958 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12
6959 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm8[1],xmm12[2,3]
6960 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
6961 ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13
6962 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6963 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm13
6964 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1
6965 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4
6966 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15]
6967 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14
6968 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7]
6969 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
6970 ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2
6971 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,1,3]
6972 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
6973 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm27[2],xmm15[3],xmm27[3]
6974 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3,4,5,6,7]
6975 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7]
6976 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
6977 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19
6978 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23
6979 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12
6980 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
6981 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
6982 ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm12
6983 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6984 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7]
6985 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2
6986 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6987 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
6988 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm31
6989 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12
6990 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4,5,6,7]
6991 ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2
6992 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm20[0,1,1,3]
6993 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7]
6994 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10
6995 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
6996 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm2[3,4,5,6,7]
6997 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7]
6998 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6999 ; AVX512F-SLOW-NEXT: vpblendw $173, (%rsp), %ymm3, %ymm12 # 32-byte Folded Reload
7000 ; AVX512F-SLOW-NEXT: # ymm12 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15]
7001 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
7002 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6],ymm12[7]
7003 ; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7
7004 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7005 ; AVX512F-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload
7006 ; AVX512F-SLOW-NEXT: # ymm12 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15]
7007 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13
7008 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7]
7009 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0
7010 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
7011 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3
7012 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6
7013 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm6[1],xmm3[2,3]
7014 ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7
7015 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
7016 ; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm7
7017 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0
7018 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3
7019 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13],ymm0[14],ymm3[15]
7020 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm24
7021 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8
7022 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7]
7023 ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0
7024 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7025 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
7026 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm18
7027 ; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm12
7028 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm11[1],ymm5[2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7,8],ymm11[9],ymm5[10],ymm11[11],ymm5[12,13],ymm11[14],ymm5[15]
7029 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7030 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
7031 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
7032 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14
7033 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15
7034 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
7035 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
7036 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3]
7037 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
7038 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7039 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm3
7040 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm1[2],ymm4[3],ymm1[4],ymm4[5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11],ymm1[12],ymm4[13,14],ymm1[15]
7041 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
7042 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4,5,6,7]
7043 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
7044 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6
7045 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1
7046 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm1[1],xmm6[2,3]
7047 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
7048 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3,4,5,6,7]
7049 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
7050 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5
7051 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7
7052 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm5[2],xmm7[3]
7053 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
7054 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
7055 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2
7056 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2
7057 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9
7058 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8
7059 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
7060 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7
7061 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4,5],xmm7[6,7]
7062 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
7063 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7064 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm7
7065 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
7066 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
7067 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm21
7068 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15]
7069 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7070 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7]
7071 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
7072 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7
7073 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7]
7074 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
7075 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
7076 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
7077 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11
7078 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7]
7079 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
7080 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
7081 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
7082 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3]
7083 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
7084 ; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4
7085 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
7086 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
7087 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
7088 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
7089 ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
7090 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1
7091 ; AVX512F-SLOW-NEXT: movb $7, %al
7092 ; AVX512F-SLOW-NEXT: kmovw %eax, %k1
7093 ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1}
7094 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
7095 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4
7096 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7]
7097 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
7098 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4
7099 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
7100 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm6
7101 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15]
7102 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
7103 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
7104 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm6
7105 ; AVX512F-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload
7106 ; AVX512F-SLOW-NEXT: # ymm4 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15]
7107 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6
7108 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3,4,5,6,7]
7109 ; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
7110 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4
7111 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2],xmm10[3]
7112 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
7113 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7]
7114 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
7115 ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
7116 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
7117 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7118 ; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload
7119 ; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15]
7120 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7121 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
7122 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2
7123 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7124 ; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
7125 ; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
7126 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
7127 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
7128 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4
7129 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
7130 ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1}
7131 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2
7132 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4
7133 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
7134 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
7135 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7]
7136 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0
7137 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2
7138 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7139 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
7140 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7141 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
7142 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7143 ; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi)
7144 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7145 ; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rsi)
7146 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7147 ; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rdx)
7148 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7149 ; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx)
7150 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7151 ; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rcx)
7152 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx)
7153 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r8)
7154 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7155 ; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8)
7156 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9)
7157 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9)
7158 ; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248
7159 ; AVX512F-SLOW-NEXT: vzeroupper
7160 ; AVX512F-SLOW-NEXT: retq
7162 ; AVX512F-FAST-LABEL: load_i16_stride5_vf64:
7163 ; AVX512F-FAST: # %bb.0:
7164 ; AVX512F-FAST-NEXT: subq $712, %rsp # imm = 0x2C8
7165 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
7166 ; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm1
7167 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7168 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1
7169 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
7170 ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm2
7171 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7172 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
7173 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
7174 ; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm10
7175 ; AVX512F-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7176 ; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm11
7177 ; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7178 ; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm7
7179 ; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7180 ; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm8
7181 ; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7182 ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm1
7183 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
7184 ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm4
7185 ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7186 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
7187 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4
7188 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7]
7189 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u>
7190 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4
7191 ; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm6
7192 ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm5
7193 ; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7194 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
7195 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm27
7196 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3]
7197 ; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5
7198 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
7199 ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5
7200 ; AVX512F-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4
7201 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
7202 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7
7203 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
7204 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
7205 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,4,7,1,4,6,u,u>
7206 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7
7207 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23>
7208 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7
7209 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6]
7210 ; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm7
7211 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
7212 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5
7213 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
7214 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
7215 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7]
7216 ; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm29
7217 ; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm18, %ymm7
7218 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
7219 ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7
7220 ; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm14
7221 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
7222 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm7
7223 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4
7224 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7225 ; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm4
7226 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0
7227 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31
7228 ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4
7229 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3
7230 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm15
7231 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
7232 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm7
7233 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8
7234 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
7235 ; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7236 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3
7237 ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3
7238 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5
7239 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm6
7240 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
7241 ; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm11
7242 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5
7243 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
7244 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1
7245 ; AVX512F-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
7246 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4
7247 ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5
7248 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
7249 ; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm13
7250 ; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5
7251 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3
7252 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3
7253 ; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm12
7254 ; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm9
7255 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15]
7256 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10
7257 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7]
7258 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2
7259 ; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm17, %ymm3
7260 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
7261 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
7262 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23
7263 ; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm18, %ymm2
7264 ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2
7265 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm2
7266 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
7267 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7268 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15]
7269 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm30
7270 ; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7271 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm17
7272 ; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7273 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
7274 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
7275 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,u,u,u,4,7,1,6>
7276 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
7277 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm22
7278 ; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7279 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm21, %ymm1
7280 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
7281 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1
7282 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u>
7283 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
7284 ; AVX512F-FAST-NEXT: vporq %ymm1, %ymm0, %ymm18
7285 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm31, %xmm0
7286 ; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13]
7287 ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm1
7288 ; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm11
7289 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
7290 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <0,2,5,7,4,7,u,u>
7291 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15]
7292 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm19
7293 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm20, %ymm10
7294 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
7295 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10
7296 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3,4,5,6,7]
7297 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
7298 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3]
7299 ; AVX512F-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
7300 ; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7301 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15]
7302 ; AVX512F-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7303 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm0
7304 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
7305 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0
7306 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
7307 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7]
7308 ; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm25, %ymm10
7309 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
7310 ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10
7311 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm26, %zmm10
7312 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0
7313 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7314 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
7315 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0
7316 ; AVX512F-FAST-NEXT: vpblendw $181, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
7317 ; AVX512F-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15]
7318 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10
7319 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6],xmm10[7]
7320 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
7321 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm13
7322 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7323 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15]
7324 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm28
7325 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm21, %ymm3
7326 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3
7327 ; AVX512F-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0
7328 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
7329 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7330 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm3
7331 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
7332 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm20, %ymm3
7333 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2
7334 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7335 ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3
7336 ; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
7337 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm27, %xmm4
7338 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
7339 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
7340 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
7341 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7342 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
7343 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4],ymm6[5],ymm8[6,7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12],ymm6[13],ymm8[14,15]
7344 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm3
7345 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
7346 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
7347 ; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm25, %ymm2
7348 ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2
7349 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm2
7350 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm21
7351 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
7352 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7353 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15]
7354 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
7355 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
7356 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
7357 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20
7358 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm31[2],xmm0[3],xmm31[3]
7359 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,3,5,2,5,7,u,u>
7360 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12
7361 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
7362 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm26
7363 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm4
7364 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
7365 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4
7366 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7]
7367 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
7368 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
7369 ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1
7370 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
7371 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7]
7372 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0
7373 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1
7374 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
7375 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3
7376 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7]
7377 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,2,u,u,5,7,2,4>
7378 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
7379 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0
7380 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1
7381 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
7382 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm3
7383 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
7384 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm0
7385 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
7386 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5
7387 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
7388 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0]
7389 ; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1]
7390 ; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm5, %ymm14
7391 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
7392 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm14
7393 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm14
7394 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0
7395 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7396 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15]
7397 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10
7398 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7]
7399 ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0
7400 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm6
7401 ; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm8
7402 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15]
7403 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm24, %ymm4
7404 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2
7405 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
7406 ; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm14
7407 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3]
7408 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm15
7409 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7]
7410 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
7411 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7412 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
7413 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4
7414 ; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload
7415 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm4[2],ymm7[3],ymm4[4],ymm7[5,6],ymm4[7],ymm7[8,9],ymm4[10],ymm7[11],ymm4[12],ymm7[13,14],ymm4[15]
7416 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm30
7417 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm22
7418 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
7419 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7]
7420 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
7421 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm4
7422 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5],ymm4[6],ymm13[7,8],ymm4[9],ymm13[10,11],ymm4[12],ymm13[13],ymm4[14],ymm13[15]
7423 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm18
7424 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm25, %ymm3
7425 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1
7426 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
7427 ; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm5, %ymm2
7428 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2
7429 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm21, %zmm2
7430 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
7431 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7432 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7
7433 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9
7434 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm9[1],xmm7[2,3]
7435 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0]
7436 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2
7437 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,6,0,5,u,u,u>
7438 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm10
7439 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15]
7440 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4
7441 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
7442 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4
7443 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
7444 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
7445 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0]
7446 ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1]
7447 ; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm5
7448 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
7449 ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5
7450 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25
7451 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3]
7452 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm31
7453 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm17
7454 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0
7455 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm8[2],ymm6[3],ymm8[4],ymm6[5,6],ymm8[7],ymm6[8,9],ymm8[10],ymm6[11],ymm8[12],ymm6[13,14],ymm8[15]
7456 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm20
7457 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
7458 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1
7459 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
7460 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7461 ; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm4, %ymm1
7462 ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1
7463 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
7464 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm9[2],xmm7[3]
7465 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15]
7466 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0]
7467 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3
7468 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <1,4,6,3,6,u,u,u>
7469 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1
7470 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
7471 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
7472 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7]
7473 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
7474 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7]
7475 ; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm26, %ymm3
7476 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
7477 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
7478 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7479 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7480 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm15[1,2],ymm4[3],ymm15[4],ymm4[5],ymm15[6,7],ymm4[8],ymm15[9,10],ymm4[11],ymm15[12],ymm4[13],ymm15[14,15]
7481 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23
7482 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
7483 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
7484 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,u,u,5,0,2,7>
7485 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12
7486 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7487 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15]
7488 ; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5
7489 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
7490 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5
7491 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
7492 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3
7493 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
7494 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
7495 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25
7496 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
7497 ; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
7498 ; AVX512F-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15]
7499 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8
7500 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7]
7501 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
7502 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8
7503 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
7504 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm9
7505 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
7506 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
7507 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21
7508 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9
7509 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm11
7510 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4],ymm9[5],ymm11[6,7],ymm9[8],ymm11[9,10],ymm9[11],ymm11[12],ymm9[13],ymm11[14,15]
7511 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9
7512 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
7513 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7
7514 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm13
7515 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8
7516 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15]
7517 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4
7518 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4
7519 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7]
7520 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24
7521 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm4
7522 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm5
7523 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
7524 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2
7525 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4
7526 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5
7527 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
7528 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm27, %ymm4
7529 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0
7530 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
7531 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7532 ; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm26, %ymm2
7533 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
7534 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
7535 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
7536 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7537 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
7538 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4
7539 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
7540 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
7541 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm3
7542 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7543 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
7544 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
7545 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2
7546 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15]
7547 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5>
7548 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
7549 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
7550 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3
7551 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6
7552 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5],ymm6[6],ymm15[7,8],ymm6[9],ymm15[10,11],ymm6[12],ymm15[13],ymm6[14],ymm15[15]
7553 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
7554 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
7555 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
7556 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6
7557 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
7558 ; AVX512F-FAST-NEXT: movb $7, %al
7559 ; AVX512F-FAST-NEXT: kmovw %eax, %k1
7560 ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
7561 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm3
7562 ; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload
7563 ; AVX512F-FAST-NEXT: # ymm6 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15]
7564 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8
7565 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7]
7566 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
7567 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6
7568 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
7569 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15]
7570 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
7571 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
7572 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3
7573 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6
7574 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
7575 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6
7576 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
7577 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3
7578 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm6
7579 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm6[2],ymm13[3],ymm6[4],ymm13[5,6],ymm6[7],ymm13[8,9],ymm6[10],ymm13[11],ymm6[12],ymm13[13,14],ymm6[15]
7580 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm4
7581 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
7582 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
7583 ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
7584 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
7585 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
7586 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
7587 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3
7588 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm4
7589 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
7590 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
7591 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
7592 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
7593 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7594 ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rsi)
7595 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7596 ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rsi)
7597 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7598 ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rdx)
7599 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7600 ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rdx)
7601 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7602 ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rcx)
7603 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
7604 ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx)
7605 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r8)
7606 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%r8)
7607 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9)
7608 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%r9)
7609 ; AVX512F-FAST-NEXT: addq $712, %rsp # imm = 0x2C8
7610 ; AVX512F-FAST-NEXT: vzeroupper
7611 ; AVX512F-FAST-NEXT: retq
7612 ;
7613 ; AVX512BW-LABEL: load_i16_stride5_vf64:
7614 ; AVX512BW: # %bb.0:
7615 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2
7616 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm4
7617 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3
7618 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5
7619 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0
7620 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7
7621 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9
7622 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11
7623 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
7624 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10
7625 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
7626 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
7627 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
7628 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13
7629 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u>
7630 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8
7631 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8
7632 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
7633 ; AVX512BW-NEXT: kmovd %eax, %k1
7634 ; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1}
7635 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
7636 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
7637 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12
7638 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
7639 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm6 {%k1}
7640 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm6
7641 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
7642 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
7643 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15
7644 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15
7645 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u>
7646 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
7647 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13
7648 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1}
7649 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
7650 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13
7651 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14
7652 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
7653 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
7654 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm12
7655 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
7656 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
7657 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17
7658 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
7659 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u>
7660 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15
7661 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
7662 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1}
7663 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
7664 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15
7665 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
7666 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14
7667 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1}
7668 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm14
7669 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
7670 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
7671 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17
7672 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
7673 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u>
7674 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19
7675 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
7676 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
7677 ; AVX512BW-NEXT: kmovd %eax, %k1
7678 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1}
7679 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
7680 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm19
7681 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16
7682 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18
7683 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1}
7684 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18
7685 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u>
7686 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9
7687 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
7688 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7689 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm11, %zmm1
7690 ; AVX512BW-NEXT: movb $7, %al
7691 ; AVX512BW-NEXT: kmovd %eax, %k1
7692 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
7693 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
7694 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm1
7695 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0
7696 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3
7697 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
7698 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm9, %zmm0
7699 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rsi)
7700 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
7701 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx)
7702 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx)
7703 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rcx)
7704 ; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx)
7705 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8)
7706 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8)
7707 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9)
7708 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9)
7709 ; AVX512BW-NEXT: vzeroupper
7710 ; AVX512BW-NEXT: retq
7711 %wide.vec = load <320 x i16>, ptr %in.vec, align 64
7712 %strided.vec0 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155, i32 160, i32 165, i32 170, i32 175, i32 180, i32 185, i32 190, i32 195, i32 200, i32 205, i32 210, i32 215, i32 220, i32 225, i32 230, i32 235, i32 240, i32 245, i32 250, i32 255, i32 260, i32 265, i32 270, i32 275, i32 280, i32 285, i32 290, i32 295, i32 300, i32 305, i32 310, i32 315>
7713 %strided.vec1 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156, i32 161, i32 166, i32 171, i32 176, i32 181, i32 186, i32 191, i32 196, i32 201, i32 206, i32 211, i32 216, i32 221, i32 226, i32 231, i32 236, i32 241, i32 246, i32 251, i32 256, i32 261, i32 266, i32 271, i32 276, i32 281, i32 286, i32 291, i32 296, i32 301, i32 306, i32 311, i32 316>
7714 %strided.vec2 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157, i32 162, i32 167, i32 172, i32 177, i32 182, i32 187, i32 192, i32 197, i32 202, i32 207, i32 212, i32 217, i32 222, i32 227, i32 232, i32 237, i32 242, i32 247, i32 252, i32 257, i32 262, i32 267, i32 272, i32 277, i32 282, i32 287, i32 292, i32 297, i32 302, i32 307, i32 312, i32 317>
7715 %strided.vec3 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158, i32 163, i32 168, i32 173, i32 178, i32 183, i32 188, i32 193, i32 198, i32 203, i32 208, i32 213, i32 218, i32 223, i32 228, i32 233, i32 238, i32 243, i32 248, i32 253, i32 258, i32 263, i32 268, i32 273, i32 278, i32 283, i32 288, i32 293, i32 298, i32 303, i32 308, i32 313, i32 318>
7716 %strided.vec4 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159, i32 164, i32 169, i32 174, i32 179, i32 184, i32 189, i32 194, i32 199, i32 204, i32 209, i32 214, i32 219, i32 224, i32 229, i32 234, i32 239, i32 244, i32 249, i32 254, i32 259, i32 264, i32 269, i32 274, i32 279, i32 284, i32 289, i32 294, i32 299, i32 304, i32 309, i32 314, i32 319>
7717 store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
7718 store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
7719 store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
7720 store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
7721 store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
7722 ret void
7723 }
7724 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
7730 ; AVX512BW-FAST: {{.*}}
7731 ; AVX512BW-ONLY-FAST: {{.*}}
7732 ; AVX512BW-ONLY-SLOW: {{.*}}
7733 ; AVX512BW-SLOW: {{.*}}
7734 ; AVX512DQ-FAST: {{.*}}
7735 ; AVX512DQ-SLOW: {{.*}}
7736 ; AVX512DQBW-FAST: {{.*}}
7737 ; AVX512DQBW-SLOW: {{.*}}
7739 ; AVX512F-ONLY-FAST: {{.*}}
7740 ; AVX512F-ONLY-SLOW: {{.*}}
7743 ; FALLBACK10: {{.*}}
7744 ; FALLBACK11: {{.*}}
7745 ; FALLBACK12: {{.*}}