1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
; Declarations of the llvm.fshl intrinsics called by the test functions in this
; file, one per 256-bit vector type (element widths i64, i32, i16 and i8).
13 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
14 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
15 declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
16 declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; var_funnnel_v4i64: forwards %x, %y and the per-element variable amount %amt
; unchanged to llvm.fshl.v4i64 (see the call at the end of this function).
; The check lines are autogenerated (see the NOTE at the top of the file), one
; group per FileCheck prefix named in the RUN lines; do not edit them by hand.
; What the expectations show:
;   * AVX2, the AVX512 variants without VBMI2, and XOPAVX2 mask the amount with
;     a [63,63,63,63] constant, shift one input left by it (vpsllvq), shift the
;     other input right by 1 and then by the inverted-and-masked amount
;     (vpsrlq $1 + vpsrlvq), and OR the two results.
;   * AVX1 and XOPAVX1 do the same computation on the two 128-bit halves.
;   * AVX512VBMI2 / AVX512VLVBMI2 emit a single vpshldvq (on zmm registers when
;     VL is not available).
; NOTE(review): presumably ymm0/ymm1/ymm2 carry %x/%y/%amt respectively -
; confirm against the x86-64 calling convention.
22 define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
23 ; AVX1-LABEL: var_funnnel_v4i64:
25 ; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
26 ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
27 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
28 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
29 ; AVX1-NEXT: vpsrlq $1, %xmm6, %xmm6
30 ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm7
31 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
32 ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
33 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
34 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
35 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm6
36 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
37 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
38 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
39 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
40 ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
41 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
42 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
43 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
44 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
45 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3
46 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
47 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4
48 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
49 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
50 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
51 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
52 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
55 ; AVX2-LABEL: var_funnnel_v4i64:
57 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
58 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
59 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
60 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
61 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
62 ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
63 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
66 ; AVX512F-LABEL: var_funnnel_v4i64:
68 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
69 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
70 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
71 ; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
72 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
73 ; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
74 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
77 ; AVX512VL-LABEL: var_funnnel_v4i64:
79 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
80 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
81 ; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
82 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
83 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
84 ; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
85 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
88 ; AVX512BW-LABEL: var_funnnel_v4i64:
90 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
91 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
92 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
93 ; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
94 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
95 ; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
96 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
99 ; AVX512VBMI2-LABEL: var_funnnel_v4i64:
100 ; AVX512VBMI2: # %bb.0:
101 ; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
102 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
103 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
104 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
105 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
106 ; AVX512VBMI2-NEXT: retq
108 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
109 ; AVX512VLBW: # %bb.0:
110 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
111 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
112 ; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
113 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
114 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
115 ; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
116 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
117 ; AVX512VLBW-NEXT: retq
119 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
120 ; AVX512VLVBMI2: # %bb.0:
121 ; AVX512VLVBMI2-NEXT: vpshldvq %ymm2, %ymm1, %ymm0
122 ; AVX512VLVBMI2-NEXT: retq
124 ; XOPAVX1-LABEL: var_funnnel_v4i64:
126 ; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
127 ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
128 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
129 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
130 ; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5
131 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
132 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
133 ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
134 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
135 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
136 ; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
137 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
138 ; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
139 ; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3
140 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
141 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
142 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
143 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
144 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
147 ; XOPAVX2-LABEL: var_funnnel_v4i64:
149 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
150 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
151 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
152 ; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
153 ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
154 ; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
155 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; The operation under test: a single intrinsic call with a non-constant amount.
157 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; var_funnnel_v8i32: forwards %x, %y and the per-element variable amount %amt
; unchanged to llvm.fshl.v8i32 (see the call at the end of this function).
; The check lines are autogenerated (see the NOTE at the top of the file), one
; group per FileCheck prefix named in the RUN lines; do not edit them by hand.
; What the expectations show:
;   * AVX2, the AVX512 variants without VBMI2, and XOPAVX2 mask the amount with
;     a splat of 31, shift one input left by it (vpsllvd), shift the other
;     input right by 1 and then by the inverted-and-masked amount
;     (vpsrld $1 + vpsrlvd), and OR the two results.
;   * AVX1 works on 128-bit halves; its left-shift half is a vpmulld by a value
;     built with vpslld $23 / vpaddd 1065353216 / vcvttps2dq.
;     NOTE(review): that looks like constructing 2^amt through the float
;     exponent field - confirm against the X86 lowering code.
;   * XOPAVX1 works on 128-bit halves with vpshld, negating the right-shift
;     amount via vpsubd from zero.
;   * AVX512VBMI2 / AVX512VLVBMI2 emit a single vpshldvd (on zmm registers when
;     VL is not available).
161 define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
162 ; AVX1-LABEL: var_funnnel_v8i32:
164 ; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31]
165 ; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4
166 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
167 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
168 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
169 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm7
170 ; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
171 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm3
172 ; AVX1-NEXT: vpsrld %xmm3, %xmm7, %xmm3
173 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
174 ; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
175 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
176 ; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
177 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
178 ; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5
179 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
180 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
181 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
182 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
183 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
184 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
185 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
186 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
187 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
188 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
189 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
190 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
191 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
192 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
193 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
194 ; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm2
195 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
196 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
197 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
198 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
199 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
200 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
201 ; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3
202 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
203 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
204 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
205 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
206 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
207 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
210 ; AVX2-LABEL: var_funnnel_v8i32:
212 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
213 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
214 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
215 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
216 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
217 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
218 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
221 ; AVX512F-LABEL: var_funnnel_v8i32:
223 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
224 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
225 ; AVX512F-NEXT: vpsrld $1, %ymm1, %ymm1
226 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
227 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
228 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
229 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
232 ; AVX512VL-LABEL: var_funnnel_v8i32:
234 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
235 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
236 ; AVX512VL-NEXT: vpsrld $1, %ymm1, %ymm1
237 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
238 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
239 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
240 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
241 ; AVX512VL-NEXT: retq
243 ; AVX512BW-LABEL: var_funnnel_v8i32:
245 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
246 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
247 ; AVX512BW-NEXT: vpsrld $1, %ymm1, %ymm1
248 ; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
249 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
250 ; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
251 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
252 ; AVX512BW-NEXT: retq
254 ; AVX512VBMI2-LABEL: var_funnnel_v8i32:
255 ; AVX512VBMI2: # %bb.0:
256 ; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
257 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
258 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
259 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
260 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
261 ; AVX512VBMI2-NEXT: retq
263 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
264 ; AVX512VLBW: # %bb.0:
265 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
266 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
267 ; AVX512VLBW-NEXT: vpsrld $1, %ymm1, %ymm1
268 ; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
269 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
270 ; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
271 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
272 ; AVX512VLBW-NEXT: retq
274 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
275 ; AVX512VLVBMI2: # %bb.0:
276 ; AVX512VLVBMI2-NEXT: vpshldvd %ymm2, %ymm1, %ymm0
277 ; AVX512VLVBMI2-NEXT: retq
279 ; XOPAVX1-LABEL: var_funnnel_v8i32:
281 ; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
282 ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
283 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
284 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
285 ; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5
286 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
287 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
288 ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
289 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
290 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
291 ; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
292 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
293 ; XOPAVX1-NEXT: vpsrld $1, %xmm5, %xmm5
294 ; XOPAVX1-NEXT: vpshld %xmm3, %xmm5, %xmm3
295 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
296 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
297 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
298 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
299 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
302 ; XOPAVX2-LABEL: var_funnnel_v8i32:
304 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
305 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
306 ; XOPAVX2-NEXT: vpsrld $1, %ymm1, %ymm1
307 ; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
308 ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
309 ; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
310 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; The operation under test: a single intrinsic call with a non-constant amount.
312 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; var_funnnel_v16i16: forwards %x, %y and the per-element variable amount %amt
; unchanged to llvm.fshl.v16i16 (see the call at the end of this function).
; The check lines are autogenerated (see the NOTE at the top of the file), one
; group per FileCheck prefix named in the RUN lines; do not edit them by hand.
; What the expectations show (every non-VBMI2 expansion masks the amount with a
; splat of 15 and ORs a left-shifted half with a right-shifted half):
;   * AVX1 works on 128-bit halves, using vpblendvb-selected shift steps for
;     the right-shift half and a vpmullw for the left-shift half.
;   * AVX2 interleaves the 16-bit elements with zero (vpunpck{l,h}wd), shifts
;     with the 32-bit vpsrlvd/vpsllvd, then repacks with vpackusdw.
;   * AVX512F / AVX512VL zero-extend to 32-bit elements in zmm registers
;     (vpmovzxwd), shift with vpsrlvd/vpsllvd, and truncate with vpmovdw.
;   * AVX512BW uses vpsrlvw/vpsllvw on zmm registers; AVX512VLBW uses the same
;     instructions on ymm registers.
;   * XOPAVX1 / XOPAVX2 work on 128-bit halves with vpshlw.
;   * AVX512VBMI2 / AVX512VLVBMI2 emit a single vpshldvw (on zmm registers when
;     VL is not available).
316 define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
317 ; AVX1-LABEL: var_funnnel_v16i16:
319 ; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
320 ; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4
321 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
322 ; AVX1-NEXT: vpsllw $12, %xmm5, %xmm6
323 ; AVX1-NEXT: vpsllw $4, %xmm5, %xmm5
324 ; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
325 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6
326 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
327 ; AVX1-NEXT: vpsrlw $9, %xmm7, %xmm3
328 ; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm7
329 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm7, %xmm3
330 ; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
331 ; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
332 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
333 ; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
334 ; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
335 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
336 ; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
337 ; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3
338 ; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
339 ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
340 ; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
341 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
342 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm6
343 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
344 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
345 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
346 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
347 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
348 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
349 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
350 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
351 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
352 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
353 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
354 ; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm2
355 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
356 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
357 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
358 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
359 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
360 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
361 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
362 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
363 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
364 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
365 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
366 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
367 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
368 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
369 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
370 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
371 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
372 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
373 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
374 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
375 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
376 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
377 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
378 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
379 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
382 ; AVX2-LABEL: var_funnnel_v16i16:
384 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
385 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
386 ; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
387 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
388 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
389 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15]
390 ; AVX2-NEXT: vpsrlvd %ymm6, %ymm7, %ymm6
391 ; AVX2-NEXT: vpsrld $16, %ymm6, %ymm6
392 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
393 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
394 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
395 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
396 ; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1
397 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15]
398 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
399 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
400 ; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
401 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
402 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11]
403 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
404 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
405 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
406 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
407 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
410 ; AVX512F-LABEL: var_funnnel_v16i16:
412 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
413 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
414 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
415 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
416 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
417 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
418 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
419 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
420 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
421 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
422 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
423 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
426 ; AVX512VL-LABEL: var_funnnel_v16i16:
428 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
429 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
430 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
431 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
432 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
433 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
434 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
435 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
436 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
437 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
438 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
439 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
440 ; AVX512VL-NEXT: retq
442 ; AVX512BW-LABEL: var_funnnel_v16i16:
444 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
445 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
446 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
447 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
448 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
449 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
450 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
451 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
452 ; AVX512BW-NEXT: retq
454 ; AVX512VBMI2-LABEL: var_funnnel_v16i16:
455 ; AVX512VBMI2: # %bb.0:
456 ; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
457 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
458 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
459 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
460 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
461 ; AVX512VBMI2-NEXT: retq
463 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
464 ; AVX512VLBW: # %bb.0:
465 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
466 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
467 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
468 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
469 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
470 ; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
471 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
472 ; AVX512VLBW-NEXT: retq
474 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
475 ; AVX512VLVBMI2: # %bb.0:
476 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm2, %ymm1, %ymm0
477 ; AVX512VLVBMI2-NEXT: retq
479 ; XOPAVX1-LABEL: var_funnnel_v16i16:
481 ; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
482 ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
483 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
484 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
485 ; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5
486 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
487 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
488 ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
489 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
490 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
491 ; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3
492 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
493 ; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
494 ; XOPAVX1-NEXT: vpshlw %xmm3, %xmm5, %xmm3
495 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2
496 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
497 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
498 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
499 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
502 ; XOPAVX2-LABEL: var_funnnel_v16i16:
504 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
505 ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
506 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
507 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
508 ; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
509 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
510 ; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
511 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
512 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
513 ; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
514 ; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
515 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
516 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
517 ; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
518 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
519 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
520 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
521 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; The operation under test: a single intrinsic call with a non-constant amount.
523 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
527 define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
528 ; AVX1-LABEL: var_funnnel_v32i8:
530 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
531 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
532 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
533 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5
534 ; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm3
535 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
536 ; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm7
537 ; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
538 ; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm8
539 ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3
540 ; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3
541 ; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm5
542 ; AVX1-NEXT: vpsrlw $2, %xmm5, %xmm7
543 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
544 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
545 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
546 ; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm5
547 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm7
548 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
549 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
550 ; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm3
551 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
552 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
553 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
554 ; AVX1-NEXT: vpand %xmm5, %xmm10, %xmm5
555 ; AVX1-NEXT: vpsllw $5, %xmm8, %xmm7
556 ; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm1
557 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5
558 ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
559 ; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm6
560 ; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
561 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5
562 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
563 ; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm5
564 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
565 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
566 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
567 ; AVX1-NEXT: vpsllw $4, %xmm3, %xmm4
568 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
569 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
570 ; AVX1-NEXT: vandps %ymm2, %ymm9, %ymm2
571 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
572 ; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6
573 ; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
574 ; AVX1-NEXT: vpsllw $2, %xmm3, %xmm4
575 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
576 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
577 ; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
578 ; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
579 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm4
580 ; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6
581 ; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3
582 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
583 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
584 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
585 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
586 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
587 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
588 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
589 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
590 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
591 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
592 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
593 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
594 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
597 ; AVX2-LABEL: var_funnnel_v32i8:
599 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
600 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
601 ; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4
602 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
603 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
604 ; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
605 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm6
606 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
607 ; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
608 ; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm6
609 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
610 ; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
611 ; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
612 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm6
613 ; AVX2-NEXT: vpand %ymm5, %ymm6, %ymm5
614 ; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
615 ; AVX2-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
616 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
617 ; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2
618 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
619 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm4
620 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
621 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
622 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
623 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
624 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
625 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
626 ; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
627 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
628 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
631 ; AVX512F-LABEL: var_funnnel_v32i8:
633 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
634 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
635 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
636 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
637 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
638 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
639 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
640 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
641 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
642 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6
643 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
644 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
645 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
646 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
647 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
648 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
649 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
650 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
651 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
652 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
653 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
654 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
655 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
656 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
657 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
658 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
659 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
660 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
661 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
662 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
665 ; AVX512VL-LABEL: var_funnnel_v32i8:
667 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
668 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
669 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
670 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
671 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
672 ; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
673 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
674 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
675 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
676 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
677 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
678 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
679 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
680 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
681 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
682 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
683 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
684 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
685 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
686 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
687 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
688 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
689 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
690 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
691 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
692 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
693 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
694 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
695 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
696 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
697 ; AVX512VL-NEXT: retq
699 ; AVX512BW-LABEL: var_funnnel_v32i8:
701 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
702 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
703 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
704 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
705 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
706 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
707 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
708 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
709 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
710 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
711 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
712 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
713 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
714 ; AVX512BW-NEXT: retq
716 ; AVX512VBMI2-LABEL: var_funnnel_v32i8:
717 ; AVX512VBMI2: # %bb.0:
718 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
719 ; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
720 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
721 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
722 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
723 ; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
724 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
725 ; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
726 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
727 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
728 ; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
729 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
730 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
731 ; AVX512VBMI2-NEXT: retq
733 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
734 ; AVX512VLBW: # %bb.0:
735 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
736 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
737 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
738 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
739 ; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
740 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
741 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
742 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
743 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
744 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
745 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
746 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
747 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
748 ; AVX512VLBW-NEXT: retq
750 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
751 ; AVX512VLVBMI2: # %bb.0:
752 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
753 ; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
754 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
755 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
756 ; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
757 ; AVX512VLVBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2
758 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
759 ; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
760 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
761 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
762 ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
763 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
764 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
765 ; AVX512VLVBMI2-NEXT: retq
767 ; XOPAVX1-LABEL: var_funnnel_v32i8:
769 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
770 ; XOPAVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
771 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3
772 ; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
773 ; XOPAVX1-NEXT: vandnps %ymm8, %ymm2, %ymm6
774 ; XOPAVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
775 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
776 ; XOPAVX1-NEXT: vpsubb %xmm7, %xmm5, %xmm7
777 ; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3
778 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
779 ; XOPAVX1-NEXT: vpsubb %xmm6, %xmm5, %xmm4
780 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
781 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
782 ; XOPAVX1-NEXT: vandps %ymm2, %ymm8, %ymm2
783 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
784 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
785 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm4, %xmm3
786 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
787 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
788 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
791 ; XOPAVX2-LABEL: var_funnnel_v32i8:
793 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
794 ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
795 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
796 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
797 ; XOPAVX2-NEXT: vpshlb %xmm5, %xmm6, %xmm5
798 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
799 ; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
800 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
801 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
802 ; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
803 ; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3
804 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
805 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
806 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
807 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm5, %xmm3
808 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm2
809 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
810 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
811 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
813 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
818 ; Uniform Variable Shifts
; Funnel-shift-left test for <4 x i64> where every lane uses the same
; variable amount: the shift amount passed to llvm.fshl.v4i64 is lane 0 of
; %amt broadcast to all four lanes.
; The autogenerated checks below show two lowering shapes. Configurations
; with VBMI2 broadcast the amount (vpbroadcastq) and use a single vpshldvq
; (widened to zmm when VL is not available). All other configurations mask
; the amount with 63, shift one operand left by (amt & 63), shift the other
; right by 1 and then by (~amt & 63), and OR the two results together.
821 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
822 ; AVX1-LABEL: splatvar_funnnel_v4i64:
824 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
825 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
826 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
827 ; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
828 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
829 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
830 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
831 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
832 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
833 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
834 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
835 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
836 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
837 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
840 ; AVX2-LABEL: splatvar_funnnel_v4i64:
842 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
843 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
844 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
845 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
846 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
847 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
848 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
851 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
853 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
854 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
855 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
856 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
857 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
858 ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0
859 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
862 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
864 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
865 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
866 ; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
867 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
868 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
869 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0
870 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
871 ; AVX512VL-NEXT: retq
873 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
875 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
876 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
877 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
878 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
879 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
880 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
881 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
882 ; AVX512BW-NEXT: retq
884 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
885 ; AVX512VBMI2: # %bb.0:
886 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
887 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
888 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
889 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
890 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
891 ; AVX512VBMI2-NEXT: retq
893 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
894 ; AVX512VLBW: # %bb.0:
895 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
896 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
897 ; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
898 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
899 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
900 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
901 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
902 ; AVX512VLBW-NEXT: retq
904 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
905 ; AVX512VLVBMI2: # %bb.0:
906 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
907 ; AVX512VLVBMI2-NEXT: vpshldvq %ymm2, %ymm1, %ymm0
908 ; AVX512VLVBMI2-NEXT: retq
910 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
912 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
913 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
914 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
915 ; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
916 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
917 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
918 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
919 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
920 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
921 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
922 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
923 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
924 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
925 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
928 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
930 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
931 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
932 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
933 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
934 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
935 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
936 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; An all-zero shuffle mask selects lane 0 of %amt for every result lane,
; producing the uniform (splat) shift amount fed to the intrinsic.
938 %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
939 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat)
; Funnel-shift-left test for <8 x i32> where every lane uses the same
; variable amount: the shift amount passed to llvm.fshl.v8i32 is lane 0 of
; %amt broadcast to all eight lanes.
; The autogenerated checks below show two lowering shapes. Configurations
; with VBMI2 broadcast the amount (vpbroadcastd) and use a single vpshldvd
; (widened to zmm when VL is not available). All other configurations mask
; the amount with 31, zero-extend the masked value with vpmovzxdq before
; using it as the vpslld/vpsrld count, shift one operand left by (amt & 31),
; shift the other right by 1 and then by (~amt & 31), and OR the results.
943 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
944 ; AVX1-LABEL: splatvar_funnnel_v8i32:
946 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
947 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
948 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
949 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
950 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
951 ; AVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5
952 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
953 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
954 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
955 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
956 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
957 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
958 ; AVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3
959 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
960 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
961 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
964 ; AVX2-LABEL: splatvar_funnnel_v8i32:
966 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
967 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
968 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
969 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
970 ; AVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1
971 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
972 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
973 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0
974 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
977 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
979 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
980 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
981 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
982 ; AVX512F-NEXT: vpsrld $1, %ymm1, %ymm1
983 ; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1
984 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
985 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
986 ; AVX512F-NEXT: vpslld %xmm2, %ymm0, %ymm0
987 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
990 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
992 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
993 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
994 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
995 ; AVX512VL-NEXT: vpsrld $1, %ymm1, %ymm1
996 ; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1
997 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
998 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
999 ; AVX512VL-NEXT: vpslld %xmm2, %ymm0, %ymm0
1000 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1001 ; AVX512VL-NEXT: retq
1003 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
1004 ; AVX512BW: # %bb.0:
1005 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1006 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1007 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1008 ; AVX512BW-NEXT: vpsrld $1, %ymm1, %ymm1
1009 ; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
1010 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
1011 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1012 ; AVX512BW-NEXT: vpslld %xmm2, %ymm0, %ymm0
1013 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1014 ; AVX512BW-NEXT: retq
1016 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
1017 ; AVX512VBMI2: # %bb.0:
1018 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1019 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1020 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
1021 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
1022 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1023 ; AVX512VBMI2-NEXT: retq
1025 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
1026 ; AVX512VLBW: # %bb.0:
1027 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1028 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1029 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1030 ; AVX512VLBW-NEXT: vpsrld $1, %ymm1, %ymm1
1031 ; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
1032 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
1033 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1034 ; AVX512VLBW-NEXT: vpslld %xmm2, %ymm0, %ymm0
1035 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1036 ; AVX512VLBW-NEXT: retq
1038 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
1039 ; AVX512VLVBMI2: # %bb.0:
1040 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
1041 ; AVX512VLVBMI2-NEXT: vpshldvd %ymm2, %ymm1, %ymm0
1042 ; AVX512VLVBMI2-NEXT: retq
1044 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
1046 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
1047 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
1048 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1049 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1050 ; XOPAVX1-NEXT: vpsrld $1, %xmm5, %xmm5
1051 ; XOPAVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5
1052 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
1053 ; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1054 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
1055 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1056 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1057 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1058 ; XOPAVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3
1059 ; XOPAVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
1060 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1061 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1062 ; XOPAVX1-NEXT: retq
1064 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
1066 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1067 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1068 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1069 ; XOPAVX2-NEXT: vpsrld $1, %ymm1, %ymm1
1070 ; XOPAVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1
1071 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1072 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1073 ; XOPAVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0
1074 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1075 ; XOPAVX2-NEXT: retq
; An all-zero shuffle mask selects lane 0 of %amt for every result lane,
; producing the uniform (splat) shift amount fed to the intrinsic.
1076 %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
1077 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %splat)
; Funnel-shift-left test for <16 x i16> where every lane uses the same
; variable amount: the shift amount passed to llvm.fshl.v16i16 is lane 0 of
; %amt broadcast to all sixteen lanes.
; The autogenerated checks below show two lowering shapes. Configurations
; with VBMI2 broadcast the amount (vpbroadcastw) and use a single vpshldvw
; (widened to zmm when VL is not available). All other configurations mask
; the amount with 15, zero-extend the masked value with vpmovzxwq before
; using it as the vpsllw/vpsrlw count, shift one operand left by (amt & 15),
; shift the other right by 1 and then by (~amt & 15), and OR the results.
1081 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
1082 ; AVX1-LABEL: splatvar_funnnel_v16i16:
1084 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1085 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
1086 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1087 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1088 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
1089 ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
1090 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1091 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1092 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
1093 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1094 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1095 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1096 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1097 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1098 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1099 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1102 ; AVX2-LABEL: splatvar_funnnel_v16i16:
1104 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1105 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1106 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1107 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1108 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1109 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1110 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1111 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1112 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1115 ; AVX512F-LABEL: splatvar_funnnel_v16i16:
1117 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1118 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
1119 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1120 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
1121 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1122 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
1123 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1124 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1125 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1126 ; AVX512F-NEXT: retq
1128 ; AVX512VL-LABEL: splatvar_funnnel_v16i16:
1129 ; AVX512VL: # %bb.0:
1130 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1131 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
1132 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1133 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
1134 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1135 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
1136 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1137 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1138 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1139 ; AVX512VL-NEXT: retq
1141 ; AVX512BW-LABEL: splatvar_funnnel_v16i16:
1142 ; AVX512BW: # %bb.0:
1143 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1144 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1145 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1146 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
1147 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1148 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
1149 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1150 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1151 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1152 ; AVX512BW-NEXT: retq
1154 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
1155 ; AVX512VBMI2: # %bb.0:
1156 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1157 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1158 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
1159 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
1160 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1161 ; AVX512VBMI2-NEXT: retq
1163 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
1164 ; AVX512VLBW: # %bb.0:
1165 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1166 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1167 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1168 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
1169 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1170 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
1171 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1172 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1173 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1174 ; AVX512VLBW-NEXT: retq
1176 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
1177 ; AVX512VLVBMI2: # %bb.0:
1178 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
1179 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm2, %ymm1, %ymm0
1180 ; AVX512VLVBMI2-NEXT: retq
1182 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
1184 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1185 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
1186 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1187 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1188 ; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
1189 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
1190 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1191 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1192 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
1193 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1194 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1195 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1196 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1197 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1198 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1199 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1200 ; XOPAVX1-NEXT: retq
1202 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
1204 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1205 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1206 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1207 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1208 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1209 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1210 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1211 ; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1212 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1213 ; XOPAVX2-NEXT: retq
; An all-zero shuffle mask selects lane 0 of %amt for every result lane,
; producing the uniform (splat) shift amount fed to the intrinsic.
1214 %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
1215 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
; Test: llvm.fshl on <32 x i8> where every lane uses the same variable shift
; amount. Lane 0 of %amt is broadcast to all 32 lanes (shufflevector with an
; all-zero mask) and that splat is passed as the shift-amount operand.
;
; The per-target assertions below are autogenerated (see the NOTE at the top
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
;
; Reading aid, taken from the AVX2 expectations: the amount is reduced modulo
; the 8-bit lane width. ymm0 is shifted left by (amt & 7); ymm1 is shifted
; right by 1 and then by (~amt & 7); the two results are OR'd together.
; (Presumably ymm0/ymm1/xmm2 hold %x/%y/%amt - confirm against the ABI.)
1219 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
1220 ; AVX1-LABEL: splatvar_funnnel_v32i8:
1222 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1223 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
1224 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
1225 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1226 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1227 ; AVX1-NEXT: vpandn %xmm8, %xmm2, %xmm6
1228 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
1229 ; AVX1-NEXT: vpsrlw %xmm6, %xmm3, %xmm3
1230 ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7
1231 ; AVX1-NEXT: vpsrlw %xmm6, %xmm7, %xmm5
1232 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1233 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
1234 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1235 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
1236 ; AVX1-NEXT: vpsrlw %xmm6, %xmm1, %xmm1
1237 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
1238 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1239 ; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2
1240 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1241 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1242 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1243 ; AVX1-NEXT: vpsllw %xmm2, %xmm7, %xmm4
1244 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
1245 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
1246 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1247 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1248 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1249 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1250 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1253 ; AVX2-LABEL: splatvar_funnnel_v32i8:
1255 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1256 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1257 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1258 ; AVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
1259 ; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1260 ; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
1261 ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4
1262 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
1263 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1264 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1265 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1266 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1267 ; AVX2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
1268 ; AVX2-NEXT: vpsrlw %xmm2, %xmm5, %xmm2
1269 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
1270 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
1271 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1272 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1275 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
1277 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1278 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1279 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1280 ; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0
1281 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1282 ; AVX512F-NEXT: vpsllw %xmm4, %xmm5, %xmm4
1283 ; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
1284 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
1285 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1286 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1287 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
1288 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1289 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
1290 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm2
1291 ; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2
1292 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
1293 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
1294 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1295 ; AVX512F-NEXT: retq
1297 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
1298 ; AVX512VL: # %bb.0:
1299 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1300 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1301 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1302 ; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0
1303 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1304 ; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4
1305 ; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
1306 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm4
1307 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm0
1308 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1309 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
1310 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1311 ; AVX512VL-NEXT: vpsrlw %xmm0, %ymm1, %ymm1
1312 ; AVX512VL-NEXT: vpsrlw %xmm0, %xmm5, %xmm0
1313 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
1314 ; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
1315 ; AVX512VL-NEXT: vpternlogq $236, %ymm1, %ymm4, %ymm0
1316 ; AVX512VL-NEXT: retq
1318 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
1319 ; AVX512BW: # %bb.0:
1320 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1321 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1322 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1323 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
1324 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1325 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1326 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
1327 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
1328 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1329 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1330 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
1331 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1332 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1333 ; AVX512BW-NEXT: retq
1335 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
1336 ; AVX512VBMI2: # %bb.0:
1337 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1338 ; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1339 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1340 ; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
1341 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1342 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1343 ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
1344 ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
1345 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1346 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1347 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0
1348 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1349 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1350 ; AVX512VBMI2-NEXT: retq
1352 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
1353 ; AVX512VLBW: # %bb.0:
1354 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1355 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1356 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1357 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
1358 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1359 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1360 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
1361 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
1362 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1363 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1364 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
1365 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1366 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1367 ; AVX512VLBW-NEXT: retq
1369 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
1370 ; AVX512VLVBMI2: # %bb.0:
1371 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1372 ; AVX512VLVBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1373 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1374 ; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
1375 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1376 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1377 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
1378 ; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
1379 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1380 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1381 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0
1382 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1383 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
1384 ; AVX512VLVBMI2-NEXT: retq
1386 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
1388 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1389 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1390 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1391 ; XOPAVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1392 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm4, %xmm4
1393 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1394 ; XOPAVX1-NEXT: vpandn %xmm6, %xmm2, %xmm7
1395 ; XOPAVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
1396 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm4, %xmm4
1397 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1
1398 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
1399 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
1400 ; XOPAVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
1401 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1402 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm3, %xmm3
1403 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1404 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1405 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1406 ; XOPAVX1-NEXT: retq
1408 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
1410 ; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1411 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1412 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1413 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
1414 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm5, %xmm5
1415 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
1416 ; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
1417 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1418 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1419 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
1420 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1421 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1422 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
1423 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm3, %xmm3
1424 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1425 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
1426 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1427 ; XOPAVX2-NEXT: retq
; IR under test: broadcast lane 0 of %amt (all-zero shuffle mask), then use
; the splat as the fshl shift amount for %x and %y.
1428 %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
1429 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat)
1433 ; Harder PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
1434 ; CGP should sink splatted select operands through the funnel shift.
;
; What the IR below does: the scalars %rot0 and %rot1 are each splatted to
; <8 x i32> (%s0, %s1). Each iteration loads 8 bytes from %control, selects
; %s0 for lanes whose byte is zero and %s1 otherwise, and uses the result as
; the fshl amount on 8 i32 values loaded from %arr. Both data operands of the
; fshl are the same loaded vector, so the call is a per-lane rotate-left; the
; rotated vector is stored back to the address it was loaded from. %index
; advances by 8 per iteration and the loop exits when %index.next == 1024.
;
; The per-target assertions are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1436 define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) {
1437 ; AVX1-LABEL: fancierRotate2:
1438 ; AVX1: # %bb.0: # %entry
1439 ; AVX1-NEXT: vmovd %edx, %xmm1
1440 ; AVX1-NEXT: vmovd %ecx, %xmm3
1441 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
1442 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
1443 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31]
1444 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm2
1445 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero
1446 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
1447 ; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2
1448 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
1449 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4
1450 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm4[0],zero,xmm4[1],zero
1451 ; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
1452 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1453 ; AVX1-NEXT: .p2align 4, 0x90
1454 ; AVX1-NEXT: .LBB8_1: # %loop
1455 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
1456 ; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
1457 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm5
1458 ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm6
1459 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
1460 ; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
1461 ; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm7
1462 ; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm0
1463 ; AVX1-NEXT: vpslld %xmm9, %xmm7, %xmm1
1464 ; AVX1-NEXT: vpsrld %xmm10, %xmm7, %xmm2
1465 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1466 ; AVX1-NEXT: vpslld %xmm9, %xmm0, %xmm2
1467 ; AVX1-NEXT: vpsrld %xmm10, %xmm0, %xmm3
1468 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
1469 ; AVX1-NEXT: vpslld %xmm11, %xmm7, %xmm3
1470 ; AVX1-NEXT: vpsrld %xmm4, %xmm7, %xmm7
1471 ; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3
1472 ; AVX1-NEXT: vblendvps %xmm6, %xmm1, %xmm3, %xmm1
1473 ; AVX1-NEXT: vpslld %xmm11, %xmm0, %xmm3
1474 ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm0
1475 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
1476 ; AVX1-NEXT: vblendvps %xmm5, %xmm2, %xmm0, %xmm0
1477 ; AVX1-NEXT: vmovups %xmm1, 4096(%rdi,%rax,4)
1478 ; AVX1-NEXT: vmovups %xmm0, 4112(%rdi,%rax,4)
1479 ; AVX1-NEXT: addq $8, %rax
1480 ; AVX1-NEXT: jne .LBB8_1
1481 ; AVX1-NEXT: # %bb.2: # %exit
1482 ; AVX1-NEXT: vzeroupper
1485 ; AVX2-LABEL: fancierRotate2:
1486 ; AVX2: # %bb.0: # %entry
1487 ; AVX2-NEXT: vmovd %edx, %xmm0
1488 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
1489 ; AVX2-NEXT: vmovd %ecx, %xmm1
1490 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1491 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
1492 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1493 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
1494 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
1495 ; AVX2-NEXT: .p2align 4, 0x90
1496 ; AVX2-NEXT: .LBB8_1: # %loop
1497 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
1498 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1499 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm5
1500 ; AVX2-NEXT: vblendvps %ymm5, %ymm0, %ymm1, %ymm5
1501 ; AVX2-NEXT: vandps %ymm3, %ymm5, %ymm5
1502 ; AVX2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm6
1503 ; AVX2-NEXT: vpsllvd %ymm5, %ymm6, %ymm7
1504 ; AVX2-NEXT: vpsubd %ymm5, %ymm4, %ymm5
1505 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5
1506 ; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5
1507 ; AVX2-NEXT: vmovdqu %ymm5, 4096(%rdi,%rax,4)
1508 ; AVX2-NEXT: addq $8, %rax
1509 ; AVX2-NEXT: jne .LBB8_1
1510 ; AVX2-NEXT: # %bb.2: # %exit
1511 ; AVX2-NEXT: vzeroupper
1514 ; AVX512F-LABEL: fancierRotate2:
1515 ; AVX512F: # %bb.0: # %entry
1516 ; AVX512F-NEXT: vmovd %edx, %xmm0
1517 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm0
1518 ; AVX512F-NEXT: vmovd %ecx, %xmm1
1519 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
1520 ; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
1521 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1522 ; AVX512F-NEXT: .p2align 4, 0x90
1523 ; AVX512F-NEXT: .LBB8_1: # %loop
1524 ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
1525 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1526 ; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1527 ; AVX512F-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1528 ; AVX512F-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4
1529 ; AVX512F-NEXT: vprolvd %zmm3, %zmm4, %zmm3
1530 ; AVX512F-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4)
1531 ; AVX512F-NEXT: addq $8, %rax
1532 ; AVX512F-NEXT: jne .LBB8_1
1533 ; AVX512F-NEXT: # %bb.2: # %exit
1534 ; AVX512F-NEXT: vzeroupper
1535 ; AVX512F-NEXT: retq
1537 ; AVX512VL-LABEL: fancierRotate2:
1538 ; AVX512VL: # %bb.0: # %entry
1539 ; AVX512VL-NEXT: vpbroadcastd %edx, %ymm0
1540 ; AVX512VL-NEXT: vpbroadcastd %ecx, %ymm1
1541 ; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00
1542 ; AVX512VL-NEXT: .p2align 4, 0x90
1543 ; AVX512VL-NEXT: .LBB8_1: # %loop
1544 ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
1545 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1546 ; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
1547 ; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
1548 ; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3
1549 ; AVX512VL-NEXT: vprolvd %ymm2, %ymm3, %ymm2
1550 ; AVX512VL-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4)
1551 ; AVX512VL-NEXT: addq $8, %rax
1552 ; AVX512VL-NEXT: jne .LBB8_1
1553 ; AVX512VL-NEXT: # %bb.2: # %exit
1554 ; AVX512VL-NEXT: vzeroupper
1555 ; AVX512VL-NEXT: retq
1557 ; AVX512BW-LABEL: fancierRotate2:
1558 ; AVX512BW: # %bb.0: # %entry
1559 ; AVX512BW-NEXT: vmovd %edx, %xmm0
1560 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
1561 ; AVX512BW-NEXT: vmovd %ecx, %xmm1
1562 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
1563 ; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
1564 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
1565 ; AVX512BW-NEXT: .p2align 4, 0x90
1566 ; AVX512BW-NEXT: .LBB8_1: # %loop
1567 ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
1568 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1569 ; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1570 ; AVX512BW-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1571 ; AVX512BW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4
1572 ; AVX512BW-NEXT: vprolvd %zmm3, %zmm4, %zmm3
1573 ; AVX512BW-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4)
1574 ; AVX512BW-NEXT: addq $8, %rax
1575 ; AVX512BW-NEXT: jne .LBB8_1
1576 ; AVX512BW-NEXT: # %bb.2: # %exit
1577 ; AVX512BW-NEXT: vzeroupper
1578 ; AVX512BW-NEXT: retq
1580 ; AVX512VBMI2-LABEL: fancierRotate2:
1581 ; AVX512VBMI2: # %bb.0: # %entry
1582 ; AVX512VBMI2-NEXT: vmovd %edx, %xmm0
1583 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %ymm0
1584 ; AVX512VBMI2-NEXT: vmovd %ecx, %xmm1
1585 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
1586 ; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
1587 ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1588 ; AVX512VBMI2-NEXT: .p2align 4, 0x90
1589 ; AVX512VBMI2-NEXT: .LBB8_1: # %loop
1590 ; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1
1591 ; AVX512VBMI2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1592 ; AVX512VBMI2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1593 ; AVX512VBMI2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1594 ; AVX512VBMI2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4
1595 ; AVX512VBMI2-NEXT: vprolvd %zmm3, %zmm4, %zmm3
1596 ; AVX512VBMI2-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4)
1597 ; AVX512VBMI2-NEXT: addq $8, %rax
1598 ; AVX512VBMI2-NEXT: jne .LBB8_1
1599 ; AVX512VBMI2-NEXT: # %bb.2: # %exit
1600 ; AVX512VBMI2-NEXT: vzeroupper
1601 ; AVX512VBMI2-NEXT: retq
1603 ; AVX512VLBW-LABEL: fancierRotate2:
1604 ; AVX512VLBW: # %bb.0: # %entry
1605 ; AVX512VLBW-NEXT: vpbroadcastd %edx, %ymm0
1606 ; AVX512VLBW-NEXT: vpbroadcastd %ecx, %ymm1
1607 ; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00
1608 ; AVX512VLBW-NEXT: .p2align 4, 0x90
1609 ; AVX512VLBW-NEXT: .LBB8_1: # %loop
1610 ; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1
1611 ; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1612 ; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
1613 ; AVX512VLBW-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
1614 ; AVX512VLBW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3
1615 ; AVX512VLBW-NEXT: vprolvd %ymm2, %ymm3, %ymm2
1616 ; AVX512VLBW-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4)
1617 ; AVX512VLBW-NEXT: addq $8, %rax
1618 ; AVX512VLBW-NEXT: jne .LBB8_1
1619 ; AVX512VLBW-NEXT: # %bb.2: # %exit
1620 ; AVX512VLBW-NEXT: vzeroupper
1621 ; AVX512VLBW-NEXT: retq
1623 ; AVX512VLVBMI2-LABEL: fancierRotate2:
1624 ; AVX512VLVBMI2: # %bb.0: # %entry
1625 ; AVX512VLVBMI2-NEXT: vpbroadcastd %edx, %ymm0
1626 ; AVX512VLVBMI2-NEXT: vpbroadcastd %ecx, %ymm1
1627 ; AVX512VLVBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
1628 ; AVX512VLVBMI2-NEXT: .p2align 4, 0x90
1629 ; AVX512VLVBMI2-NEXT: .LBB8_1: # %loop
1630 ; AVX512VLVBMI2-NEXT: # =>This Inner Loop Header: Depth=1
1631 ; AVX512VLVBMI2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1632 ; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
1633 ; AVX512VLVBMI2-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
1634 ; AVX512VLVBMI2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3
1635 ; AVX512VLVBMI2-NEXT: vprolvd %ymm2, %ymm3, %ymm2
1636 ; AVX512VLVBMI2-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4)
1637 ; AVX512VLVBMI2-NEXT: addq $8, %rax
1638 ; AVX512VLVBMI2-NEXT: jne .LBB8_1
1639 ; AVX512VLVBMI2-NEXT: # %bb.2: # %exit
1640 ; AVX512VLVBMI2-NEXT: vzeroupper
1641 ; AVX512VLVBMI2-NEXT: retq
1643 ; XOPAVX1-LABEL: fancierRotate2:
1644 ; XOPAVX1: # %bb.0: # %entry
1645 ; XOPAVX1-NEXT: vmovd %edx, %xmm0
1646 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1647 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1648 ; XOPAVX1-NEXT: vmovd %ecx, %xmm1
1649 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1650 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
1651 ; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
1652 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1653 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1654 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1655 ; XOPAVX1-NEXT: .p2align 4, 0x90
1656 ; XOPAVX1-NEXT: .LBB8_1: # %loop
1657 ; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1
1658 ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
1659 ; XOPAVX1-NEXT: vpcomeqb %xmm2, %xmm5, %xmm5
1660 ; XOPAVX1-NEXT: vpmovsxbd %xmm5, %xmm6
1661 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
1662 ; XOPAVX1-NEXT: vpmovsxbd %xmm5, %xmm5
1663 ; XOPAVX1-NEXT: vblendvps %xmm5, %xmm3, %xmm4, %xmm5
1664 ; XOPAVX1-NEXT: vprotd %xmm5, 4112(%rdi,%rax,4), %xmm5
1665 ; XOPAVX1-NEXT: vblendvps %xmm6, %xmm0, %xmm1, %xmm6
1666 ; XOPAVX1-NEXT: vprotd %xmm6, 4096(%rdi,%rax,4), %xmm6
1667 ; XOPAVX1-NEXT: vmovdqu %xmm6, 4096(%rdi,%rax,4)
1668 ; XOPAVX1-NEXT: vmovdqu %xmm5, 4112(%rdi,%rax,4)
1669 ; XOPAVX1-NEXT: addq $8, %rax
1670 ; XOPAVX1-NEXT: jne .LBB8_1
1671 ; XOPAVX1-NEXT: # %bb.2: # %exit
1672 ; XOPAVX1-NEXT: vzeroupper
1673 ; XOPAVX1-NEXT: retq
1675 ; XOPAVX2-LABEL: fancierRotate2:
1676 ; XOPAVX2: # %bb.0: # %entry
1677 ; XOPAVX2-NEXT: vmovd %edx, %xmm0
1678 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %ymm0
1679 ; XOPAVX2-NEXT: vmovd %ecx, %xmm1
1680 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1681 ; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
1682 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1683 ; XOPAVX2-NEXT: .p2align 4, 0x90
1684 ; XOPAVX2-NEXT: .LBB8_1: # %loop
1685 ; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1
1686 ; XOPAVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1687 ; XOPAVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1688 ; XOPAVX2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1689 ; XOPAVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
1690 ; XOPAVX2-NEXT: vprotd %xmm4, 4112(%rdi,%rax,4), %xmm4
1691 ; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm3
1692 ; XOPAVX2-NEXT: vmovdqu %xmm3, 4096(%rdi,%rax,4)
1693 ; XOPAVX2-NEXT: vmovdqu %xmm4, 4112(%rdi,%rax,4)
1694 ; XOPAVX2-NEXT: addq $8, %rax
1695 ; XOPAVX2-NEXT: jne .LBB8_1
1696 ; XOPAVX2-NEXT: # %bb.2: # %exit
1697 ; XOPAVX2-NEXT: vzeroupper
1698 ; XOPAVX2-NEXT: retq
; Splat each scalar rotate amount across all 8 lanes (insert into lane 0, then
; shuffle with an all-zero mask).
1700 %i0 = insertelement <8 x i32> undef, i32 %rot0, i32 0
1701 %s0 = shufflevector <8 x i32> %i0, <8 x i32> undef, <8 x i32> zeroinitializer
1702 %i1 = insertelement <8 x i32> undef, i32 %rot1, i32 0
1703 %s1 = shufflevector <8 x i32> %i1, <8 x i32> undef, <8 x i32> zeroinitializer
1707 %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
1708 %t0 = getelementptr inbounds i8, i8* %control, i64 %index
1709 %t1 = bitcast i8* %t0 to <8 x i8>*
1710 %wide.load = load <8 x i8>, <8 x i8>* %t1, align 1
; Per-lane choice of rotate amount: %s0 where the control byte is zero,
; %s1 otherwise.
1711 %t2 = icmp eq <8 x i8> %wide.load, zeroinitializer
1712 %shamt = select <8 x i1> %t2, <8 x i32> %s0, <8 x i32> %s1
1713 %t4 = getelementptr inbounds i32, i32* %arr, i64 %index
1714 %t5 = bitcast i32* %t4 to <8 x i32>*
1715 %wide.load21 = load <8 x i32>, <8 x i32>* %t5, align 4
; fshl with identical first and second operands is a rotate-left; the result
; overwrites the vector that was just loaded.
1716 %rot = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load21, <8 x i32> %wide.load21, <8 x i32> %shamt)
1717 store <8 x i32> %rot, <8 x i32>* %t5, align 4
1718 %index.next = add i64 %index, 8
1719 %t7 = icmp eq i64 %index.next, 1024
1720 br i1 %t7, label %exit, label %loop
; Test: llvm.fshl on <4 x i64> with a constant, non-uniform shift vector
; <4, 14, 50, 60> (a different amount in every lane).
;
; The per-target assertions below are autogenerated (see the NOTE at the top
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
;
; Reading aid, taken from the AVX1 expectations: each lane is lowered to a
; pair of immediate shifts, a left shift by the lane's amount and a right
; shift by (64 - amount) (4 pairs with 60, 14 with 50, and so on), whose
; results are blended per lane and OR'd together.
1730 define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
1731 ; AVX1-LABEL: constant_funnnel_v4i64:
1733 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1734 ; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm3
1735 ; AVX1-NEXT: vpsrlq $14, %xmm2, %xmm2
1736 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1737 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm3
1738 ; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1
1739 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1740 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1741 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1742 ; AVX1-NEXT: vpsllq $60, %xmm2, %xmm3
1743 ; AVX1-NEXT: vpsllq $50, %xmm2, %xmm2
1744 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1745 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
1746 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
1747 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
1748 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1749 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1752 ; AVX2-LABEL: constant_funnnel_v4i64:
1754 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1755 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1756 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1759 ; AVX512F-LABEL: constant_funnnel_v4i64:
1761 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1762 ; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1763 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1764 ; AVX512F-NEXT: retq
1766 ; AVX512VL-LABEL: constant_funnnel_v4i64:
1767 ; AVX512VL: # %bb.0:
1768 ; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1769 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1770 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1771 ; AVX512VL-NEXT: retq
1773 ; AVX512BW-LABEL: constant_funnnel_v4i64:
1774 ; AVX512BW: # %bb.0:
1775 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1776 ; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1777 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1778 ; AVX512BW-NEXT: retq
1780 ; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
1781 ; AVX512VBMI2: # %bb.0:
1782 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1783 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1784 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,14,50,60]
1785 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
1786 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1787 ; AVX512VBMI2-NEXT: retq
1789 ; AVX512VLBW-LABEL: constant_funnnel_v4i64:
1790 ; AVX512VLBW: # %bb.0:
1791 ; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1792 ; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1793 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1794 ; AVX512VLBW-NEXT: retq
1796 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
1797 ; AVX512VLVBMI2: # %bb.0:
1798 ; AVX512VLVBMI2-NEXT: vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1799 ; AVX512VLVBMI2-NEXT: retq
1801 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
1803 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1804 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1805 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1806 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1807 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1808 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1809 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1810 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1811 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1812 ; XOPAVX1-NEXT: retq
1814 ; XOPAVX2-LABEL: constant_funnnel_v4i64:
1816 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1817 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1818 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1819 ; XOPAVX2-NEXT: retq
1820 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
; Funnel-shift-left of <8 x i32> %x, %y by a distinct constant amount per lane (4 through 11).
; The immediate right shifts in the plain AVX checks (28 down to 21) are 32 minus those amounts.
; Both VBMI2 configurations are expected to select vpshldvd; the XOP + AVX run uses vpshld;
; the other AVX2 and AVX-512 configurations are expected to emit vpsrlvd, vpsllvd, then vpor.
1824 define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
1825 ; AVX1-LABEL: constant_funnnel_v8i32:
1827 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1828 ; AVX1-NEXT: vpsrld $21, %xmm2, %xmm3
1829 ; AVX1-NEXT: vpsrld $23, %xmm2, %xmm4
1830 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1831 ; AVX1-NEXT: vpsrld $22, %xmm2, %xmm4
1832 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
1833 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1834 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1835 ; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3
1836 ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm4
1837 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1838 ; AVX1-NEXT: vpsrld $26, %xmm1, %xmm4
1839 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
1840 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1841 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1842 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1843 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1844 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1845 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1846 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1847 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1850 ; AVX2-LABEL: constant_funnnel_v8i32:
1852 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1853 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1854 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1857 ; AVX512F-LABEL: constant_funnnel_v8i32:
1859 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1860 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1861 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1862 ; AVX512F-NEXT: retq
1864 ; AVX512VL-LABEL: constant_funnnel_v8i32:
1865 ; AVX512VL: # %bb.0:
1866 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1867 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1868 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1869 ; AVX512VL-NEXT: retq
1871 ; AVX512BW-LABEL: constant_funnnel_v8i32:
1872 ; AVX512BW: # %bb.0:
1873 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1874 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1875 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1876 ; AVX512BW-NEXT: retq
1878 ; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
1879 ; AVX512VBMI2: # %bb.0:
1880 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1881 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1882 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11]
1883 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
1884 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1885 ; AVX512VBMI2-NEXT: retq
1887 ; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1888 ; AVX512VLBW: # %bb.0:
1889 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1890 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1891 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1892 ; AVX512VLBW-NEXT: retq
1894 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
1895 ; AVX512VLVBMI2: # %bb.0:
1896 ; AVX512VLVBMI2-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1897 ; AVX512VLVBMI2-NEXT: retq
1899 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
1901 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1902 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1903 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1904 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1905 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1906 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1907 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1908 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1909 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1910 ; XOPAVX1-NEXT: retq
1912 ; XOPAVX2-LABEL: constant_funnnel_v8i32:
1914 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1915 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1916 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1917 ; XOPAVX2-NEXT: retq
1918 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
; Funnel-shift-left of <16 x i16> %x, %y where lane i uses the constant amount i (0 through 15).
; Both VBMI2 configurations are expected to select vpshldvw. The BW configurations without VBMI2
; are expected to use vpsllvw on ymm0/zmm0 plus vpsrlw $1 and vpsrlvw on ymm1/zmm1.
; The XOP + AVX run uses vpshlw; the remaining runs use multiply-based sequences (vpmullw, vpmulhuw).
1922 define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
1923 ; AVX1-LABEL: constant_funnnel_v16i16:
1925 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1926 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
1927 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
1928 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6],xmm2[7]
1929 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1930 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1931 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1932 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1933 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1934 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1935 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1936 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1939 ; AVX2-LABEL: constant_funnnel_v16i16:
1941 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1942 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1943 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
1944 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1945 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1946 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1949 ; AVX512F-LABEL: constant_funnnel_v16i16:
1951 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
1952 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1953 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
1954 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1955 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1956 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1957 ; AVX512F-NEXT: retq
1959 ; AVX512VL-LABEL: constant_funnnel_v16i16:
1960 ; AVX512VL: # %bb.0:
1961 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
1962 ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1963 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
1964 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1965 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1966 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1967 ; AVX512VL-NEXT: retq
1969 ; AVX512BW-LABEL: constant_funnnel_v16i16:
1970 ; AVX512BW: # %bb.0:
1971 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1972 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1973 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1974 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
1975 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
1976 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
1977 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1978 ; AVX512BW-NEXT: retq
1980 ; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
1981 ; AVX512VBMI2: # %bb.0:
1982 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1983 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1984 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1985 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
1986 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1987 ; AVX512VBMI2-NEXT: retq
1989 ; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1990 ; AVX512VLBW: # %bb.0:
1991 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1992 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
1993 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1994 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1995 ; AVX512VLBW-NEXT: retq
1997 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
1998 ; AVX512VLVBMI2: # %bb.0:
1999 ; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
2000 ; AVX512VLVBMI2-NEXT: retq
2002 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
2004 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
2005 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2006 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2007 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2008 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
2009 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2010 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2011 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
2012 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2013 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2014 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2015 ; XOPAVX1-NEXT: retq
2017 ; XOPAVX2-LABEL: constant_funnnel_v16i16:
2019 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
2020 ; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
2021 ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
2022 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2023 ; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2024 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2025 ; XOPAVX2-NEXT: retq
2026 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
; Funnel-shift-left of <32 x i8> %x, %y by the 16-entry constant amount pattern
; 0,1,...,7,8,7,...,1, repeated for the second 16 lanes. The amount 8 equals the i8 element
; width; the XOP checks load that lane's left-shift amount as 0.
; None of the runs expects a vpshld* instruction here. The BW and VBMI2 configurations are
; expected to widen both inputs with vpmovzxbw, shift as 16-bit lanes (vpsllvw / vpsrlvw),
; OR the halves together and narrow the result with vpmovwb.
2030 define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
2031 ; AVX1-LABEL: constant_funnnel_v32i8:
2033 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2034 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2035 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2]
2036 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
2037 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
2038 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
2039 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2040 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128]
2041 ; AVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2
2042 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
2043 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2044 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2045 ; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
2046 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
2047 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2048 ; AVX1-NEXT: vpmullw %xmm6, %xmm0, %xmm0
2049 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
2050 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
2051 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2052 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2053 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
2054 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
2055 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2056 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
2057 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
2058 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2,256,128,64,32,16,8,4]
2059 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5
2060 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
2061 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2062 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [2,4,8,16,32,64,128,256]
2063 ; AVX1-NEXT: vpmullw %xmm7, %xmm2, %xmm2
2064 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2065 ; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
2066 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
2067 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2068 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
2069 ; AVX1-NEXT: vpmullw %xmm6, %xmm3, %xmm3
2070 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2071 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2072 ; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
2073 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2074 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
2075 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2076 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2079 ; AVX2-LABEL: constant_funnnel_v32i8:
2081 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
2082 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2083 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
2084 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
2085 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2086 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
2087 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2088 ; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
2089 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2090 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
2091 ; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
2092 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2093 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
2094 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2095 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
2096 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
2097 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2098 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
2099 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
2100 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2101 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2102 ; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
2103 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2106 ; AVX512F-LABEL: constant_funnnel_v32i8:
2108 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
2109 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2110 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
2111 ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
2112 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2113 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
2114 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2115 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
2116 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2117 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
2118 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
2119 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2120 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
2121 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2122 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
2123 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
2124 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2125 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
2126 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
2127 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2128 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
2129 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
2130 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2131 ; AVX512F-NEXT: retq
2133 ; AVX512VL-LABEL: constant_funnnel_v32i8:
2134 ; AVX512VL: # %bb.0:
2135 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
2136 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2137 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
2138 ; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
2139 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2140 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
2141 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2142 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
2143 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2144 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
2145 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
2146 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2147 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
2148 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2149 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
2150 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
2151 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
2152 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
2153 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
2154 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2155 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
2156 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
2157 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
2158 ; AVX512VL-NEXT: retq
2160 ; AVX512BW-LABEL: constant_funnnel_v32i8:
2161 ; AVX512BW: # %bb.0:
2162 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2163 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2164 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
2165 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2166 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2167 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2168 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
2169 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2170 ; AVX512BW-NEXT: retq
2172 ; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
2173 ; AVX512VBMI2: # %bb.0:
2174 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2175 ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2176 ; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
2177 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2178 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2179 ; AVX512VBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2180 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
2181 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2182 ; AVX512VBMI2-NEXT: retq
2184 ; AVX512VLBW-LABEL: constant_funnnel_v32i8:
2185 ; AVX512VLBW: # %bb.0:
2186 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2187 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2188 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
2189 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2190 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2191 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2192 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
2193 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
2194 ; AVX512VLBW-NEXT: retq
2196 ; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
2197 ; AVX512VLVBMI2: # %bb.0:
2198 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
2199 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2200 ; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1
2201 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2202 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
2203 ; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2204 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
2205 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
2206 ; AVX512VLVBMI2-NEXT: retq
2208 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
2210 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2211 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2212 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
2213 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
2214 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2215 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2216 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
2217 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
2218 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
2219 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm2, %xmm2
2220 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
2221 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
2222 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2223 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2224 ; XOPAVX1-NEXT: retq
2226 ; XOPAVX2-LABEL: constant_funnnel_v32i8:
2228 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2229 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2230 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
2231 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0
2232 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2233 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
2234 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2235 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2236 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
2237 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
2238 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
2239 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2240 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2241 ; XOPAVX2-NEXT: retq
2242 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2247 ; Uniform Constant Shifts
; Funnel-shift-left of <4 x i64> %x, %y by the same constant amount (14) in every lane.
; Both VBMI2 configurations are expected to select the immediate form vpshldq $14.
; All other runs are expected to emit vpsllq $14 on ymm0 (or its halves), vpsrlq $50 on ymm1
; (50 = 64 - 14), and an OR of the two results.
2250 define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
2251 ; AVX1-LABEL: splatconstant_funnnel_v4i64:
2253 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm2
2254 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2255 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm1
2256 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2257 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm2
2258 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2259 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0
2260 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2261 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2264 ; AVX2-LABEL: splatconstant_funnnel_v4i64:
2266 ; AVX2-NEXT: vpsrlq $50, %ymm1, %ymm1
2267 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0
2268 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2271 ; AVX512F-LABEL: splatconstant_funnnel_v4i64:
2273 ; AVX512F-NEXT: vpsrlq $50, %ymm1, %ymm1
2274 ; AVX512F-NEXT: vpsllq $14, %ymm0, %ymm0
2275 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2276 ; AVX512F-NEXT: retq
2278 ; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
2279 ; AVX512VL: # %bb.0:
2280 ; AVX512VL-NEXT: vpsrlq $50, %ymm1, %ymm1
2281 ; AVX512VL-NEXT: vpsllq $14, %ymm0, %ymm0
2282 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
2283 ; AVX512VL-NEXT: retq
2285 ; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
2286 ; AVX512BW: # %bb.0:
2287 ; AVX512BW-NEXT: vpsrlq $50, %ymm1, %ymm1
2288 ; AVX512BW-NEXT: vpsllq $14, %ymm0, %ymm0
2289 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2290 ; AVX512BW-NEXT: retq
2292 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
2293 ; AVX512VBMI2: # %bb.0:
2294 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2295 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2296 ; AVX512VBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0
2297 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2298 ; AVX512VBMI2-NEXT: retq
2300 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
2301 ; AVX512VLBW: # %bb.0:
2302 ; AVX512VLBW-NEXT: vpsrlq $50, %ymm1, %ymm1
2303 ; AVX512VLBW-NEXT: vpsllq $14, %ymm0, %ymm0
2304 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2305 ; AVX512VLBW-NEXT: retq
2307 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
2308 ; AVX512VLVBMI2: # %bb.0:
2309 ; AVX512VLVBMI2-NEXT: vpshldq $14, %ymm1, %ymm0, %ymm0
2310 ; AVX512VLVBMI2-NEXT: retq
2312 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
2314 ; XOPAVX1-NEXT: vpsrlq $50, %xmm1, %xmm2
2315 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2316 ; XOPAVX1-NEXT: vpsrlq $50, %xmm1, %xmm1
2317 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2318 ; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm2
2319 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2320 ; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm0
2321 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2322 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2323 ; XOPAVX1-NEXT: retq
2325 ; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
2327 ; XOPAVX2-NEXT: vpsrlq $50, %ymm1, %ymm1
2328 ; XOPAVX2-NEXT: vpsllq $14, %ymm0, %ymm0
2329 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2330 ; XOPAVX2-NEXT: retq
2331 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
; Funnel-shift-left of <8 x i32> %x, %y by the same constant amount (4) in every lane.
; Both VBMI2 configurations are expected to select the immediate form vpshldd $4.
; All other runs are expected to emit vpslld $4 on ymm0 (or its halves), vpsrld $28 on ymm1
; (28 = 32 - 4), and an OR of the two results.
2335 define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
2336 ; AVX1-LABEL: splatconstant_funnnel_v8i32:
2338 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2339 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2340 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
2341 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2342 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm2
2343 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2344 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
2345 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2346 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2349 ; AVX2-LABEL: splatconstant_funnnel_v8i32:
2351 ; AVX2-NEXT: vpsrld $28, %ymm1, %ymm1
2352 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
2353 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2356 ; AVX512F-LABEL: splatconstant_funnnel_v8i32:
2358 ; AVX512F-NEXT: vpsrld $28, %ymm1, %ymm1
2359 ; AVX512F-NEXT: vpslld $4, %ymm0, %ymm0
2360 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2361 ; AVX512F-NEXT: retq
2363 ; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
2364 ; AVX512VL: # %bb.0:
2365 ; AVX512VL-NEXT: vpsrld $28, %ymm1, %ymm1
2366 ; AVX512VL-NEXT: vpslld $4, %ymm0, %ymm0
2367 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
2368 ; AVX512VL-NEXT: retq
2370 ; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
2371 ; AVX512BW: # %bb.0:
2372 ; AVX512BW-NEXT: vpsrld $28, %ymm1, %ymm1
2373 ; AVX512BW-NEXT: vpslld $4, %ymm0, %ymm0
2374 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2375 ; AVX512BW-NEXT: retq
2377 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
2378 ; AVX512VBMI2: # %bb.0:
2379 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2380 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2381 ; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
2382 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2383 ; AVX512VBMI2-NEXT: retq
2385 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
2386 ; AVX512VLBW: # %bb.0:
2387 ; AVX512VLBW-NEXT: vpsrld $28, %ymm1, %ymm1
2388 ; AVX512VLBW-NEXT: vpslld $4, %ymm0, %ymm0
2389 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2390 ; AVX512VLBW-NEXT: retq
2392 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
2393 ; AVX512VLVBMI2: # %bb.0:
2394 ; AVX512VLVBMI2-NEXT: vpshldd $4, %ymm1, %ymm0, %ymm0
2395 ; AVX512VLVBMI2-NEXT: retq
2397 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
2399 ; XOPAVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2400 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2401 ; XOPAVX1-NEXT: vpsrld $28, %xmm1, %xmm1
2402 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2403 ; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm2
2404 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2405 ; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm0
2406 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2407 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2408 ; XOPAVX1-NEXT: retq
2410 ; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
2412 ; XOPAVX2-NEXT: vpsrld $28, %ymm1, %ymm1
2413 ; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm0
2414 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2415 ; XOPAVX2-NEXT: retq
2416 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
; Funnel shift left (llvm.fshl) on <16 x i16> where the shift amount is the
; same constant, 7, in every lane.
; The autogenerated checks below expect most configurations to emit a
; vpsllw $7 / vpsrlw $9 pair whose results are combined with vpor (vorps on
; the avx-only configurations, which also split the work into 128-bit halves
; via vextractf128/vinsertf128). The configurations built with avx512vbmi2
; instead expect a single vpshldw $7.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2420 define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
2421 ; AVX1-LABEL: splatconstant_funnnel_v16i16:
2423 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
2424 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2425 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm1
2426 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2427 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm2
2428 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2429 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
2430 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2431 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2434 ; AVX2-LABEL: splatconstant_funnnel_v16i16:
2436 ; AVX2-NEXT: vpsrlw $9, %ymm1, %ymm1
2437 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
2438 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2441 ; AVX512F-LABEL: splatconstant_funnnel_v16i16:
2443 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1
2444 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
2445 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2446 ; AVX512F-NEXT: retq
2448 ; AVX512VL-LABEL: splatconstant_funnnel_v16i16:
2449 ; AVX512VL: # %bb.0:
2450 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1
2451 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
2452 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
2453 ; AVX512VL-NEXT: retq
2455 ; AVX512BW-LABEL: splatconstant_funnnel_v16i16:
2456 ; AVX512BW: # %bb.0:
2457 ; AVX512BW-NEXT: vpsrlw $9, %ymm1, %ymm1
2458 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
2459 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2460 ; AVX512BW-NEXT: retq
2462 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16:
2463 ; AVX512VBMI2: # %bb.0:
2464 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2465 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2466 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
2467 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2468 ; AVX512VBMI2-NEXT: retq
2470 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16:
2471 ; AVX512VLBW: # %bb.0:
2472 ; AVX512VLBW-NEXT: vpsrlw $9, %ymm1, %ymm1
2473 ; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
2474 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2475 ; AVX512VLBW-NEXT: retq
2477 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
2478 ; AVX512VLVBMI2: # %bb.0:
2479 ; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm0
2480 ; AVX512VLVBMI2-NEXT: retq
2482 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
2484 ; XOPAVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
2485 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2486 ; XOPAVX1-NEXT: vpsrlw $9, %xmm1, %xmm1
2487 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2488 ; XOPAVX1-NEXT: vpsllw $7, %xmm0, %xmm2
2489 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2490 ; XOPAVX1-NEXT: vpsllw $7, %xmm0, %xmm0
2491 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2492 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2493 ; XOPAVX1-NEXT: retq
2495 ; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
2497 ; XOPAVX2-NEXT: vpsrlw $9, %ymm1, %ymm1
2498 ; XOPAVX2-NEXT: vpsllw $7, %ymm0, %ymm0
2499 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2500 ; XOPAVX2-NEXT: retq
2501 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
2505 define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
2506 ; AVX1-LABEL: splatconstant_funnnel_v32i8:
2508 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2509 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2510 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2511 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2512 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2513 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2514 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2515 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2516 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
2517 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
2518 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2519 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
2520 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2521 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2522 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2525 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
2527 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2528 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2529 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
2530 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2531 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2534 ; AVX512F-LABEL: splatconstant_funnnel_v32i8:
2536 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
2537 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2538 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
2539 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2540 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2541 ; AVX512F-NEXT: retq
2543 ; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
2544 ; AVX512VL: # %bb.0:
2545 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
2546 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0
2547 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2548 ; AVX512VL-NEXT: retq
2550 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
2551 ; AVX512BW: # %bb.0:
2552 ; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm1
2553 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2554 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm0
2555 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2556 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2557 ; AVX512BW-NEXT: retq
2559 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
2560 ; AVX512VBMI2: # %bb.0:
2561 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm1
2562 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2563 ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm0
2564 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2565 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
2566 ; AVX512VBMI2-NEXT: retq
2568 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
2569 ; AVX512VLBW: # %bb.0:
2570 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2
2571 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0
2572 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2573 ; AVX512VLBW-NEXT: retq
2575 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
2576 ; AVX512VLVBMI2: # %bb.0:
2577 ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm2
2578 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0
2579 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2580 ; AVX512VLVBMI2-NEXT: retq
2582 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
2584 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2585 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
2586 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
2587 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
2588 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2589 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2590 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
2591 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
2592 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
2593 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2594 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2595 ; XOPAVX1-NEXT: retq
2597 ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
2599 ; XOPAVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2600 ; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm0
2601 ; XOPAVX2-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0
2602 ; XOPAVX2-NEXT: retq
2603 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)