1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX10,AVX512VLVBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX10,AVX10_256
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX10,AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
; Funnel-shift-left (fshl) intrinsic declarations, one per 256-bit vector
; layout covered here: 4 x i64, 8 x i32, 16 x i16 and 32 x i8. Each intrinsic
; takes three operands of the same vector type and returns that type.
15 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
16 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
17 declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
18 declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; Variable funnel shift left on <4 x i64>: %x, %y and the shift-amount vector
; %amt are forwarded unchanged to @llvm.fshl.v4i64 (the third operand is the
; shift amount per the LLVM LangRef).
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
; As recorded below, the AVX512VBMI2 and AVX10 prefixes expect a single
; vpshldvq, while the remaining prefixes expect an expanded
; mask / shift-left / shift-right / or sequence.
24 define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
25 ; AVX1-LABEL: var_funnnel_v4i64:
27 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
28 ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
29 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
30 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
31 ; AVX1-NEXT: vpsrlq $1, %xmm6, %xmm6
32 ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm7
33 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
34 ; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5
35 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
36 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
37 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm6
38 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
39 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
40 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
41 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
42 ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
43 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
44 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
45 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
46 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
47 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3
48 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
49 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4
50 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
51 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
52 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
53 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
54 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
57 ; AVX2-LABEL: var_funnnel_v4i64:
59 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
60 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
61 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
62 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
63 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
64 ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
65 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
68 ; AVX512F-LABEL: var_funnnel_v4i64:
70 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
71 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
72 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
73 ; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
74 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
75 ; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
76 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
79 ; AVX512VL-LABEL: var_funnnel_v4i64:
81 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
82 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
83 ; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
84 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
85 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
86 ; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
87 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
90 ; AVX512BW-LABEL: var_funnnel_v4i64:
92 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
93 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
94 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
95 ; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
96 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
97 ; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
98 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
101 ; AVX512VBMI2-LABEL: var_funnnel_v4i64:
102 ; AVX512VBMI2: # %bb.0:
103 ; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
104 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
105 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
106 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
107 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
108 ; AVX512VBMI2-NEXT: retq
110 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
111 ; AVX512VLBW: # %bb.0:
112 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
113 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
114 ; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
115 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
116 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
117 ; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
118 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
119 ; AVX512VLBW-NEXT: retq
121 ; AVX10-LABEL: var_funnnel_v4i64:
123 ; AVX10-NEXT: vpshldvq %ymm2, %ymm1, %ymm0
126 ; XOPAVX1-LABEL: var_funnnel_v4i64:
128 ; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63]
129 ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
130 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
131 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
132 ; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5
133 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
134 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
135 ; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
136 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
137 ; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
138 ; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
139 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
140 ; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
141 ; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3
142 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
143 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
144 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
145 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
146 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
149 ; XOPAVX2-LABEL: var_funnnel_v4i64:
151 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
152 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
153 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
154 ; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
155 ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
156 ; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
157 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
159 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; Variable funnel shift left on <8 x i32>: %x, %y and the shift-amount vector
; %amt are forwarded unchanged to @llvm.fshl.v8i32 (the third operand is the
; shift amount per the LLVM LangRef).
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
; As recorded below, the AVX512VBMI2 and AVX10 prefixes expect a single
; vpshldvd, while the remaining prefixes expect longer expanded sequences.
163 define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
164 ; AVX1-LABEL: var_funnnel_v8i32:
166 ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
167 ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
168 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
169 ; AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm5
170 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
171 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
172 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm7
173 ; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6
174 ; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm8
175 ; AVX1-NEXT: vpsrld %xmm8, %xmm7, %xmm8
176 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4,5,6,7]
177 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
178 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
179 ; AVX1-NEXT: vpsrld %xmm9, %xmm7, %xmm9
180 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
181 ; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5
182 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7]
183 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
184 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
185 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
186 ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
187 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
188 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
189 ; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4
190 ; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
191 ; AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm3
192 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
193 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
194 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
195 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7
196 ; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
197 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7]
198 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
199 ; AVX1-NEXT: vpsrld %xmm7, %xmm1, %xmm7
200 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
201 ; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm1
202 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7]
203 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
204 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
205 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
206 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
207 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
208 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
209 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
212 ; AVX2-LABEL: var_funnnel_v8i32:
214 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
215 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
216 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
217 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
218 ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
219 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
220 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
223 ; AVX512F-LABEL: var_funnnel_v8i32:
225 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
226 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
227 ; AVX512F-NEXT: vpsrld $1, %ymm1, %ymm1
228 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
229 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
230 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
231 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
234 ; AVX512VL-LABEL: var_funnnel_v8i32:
236 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
237 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
238 ; AVX512VL-NEXT: vpsrld $1, %ymm1, %ymm1
239 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
240 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
241 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
242 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
243 ; AVX512VL-NEXT: retq
245 ; AVX512BW-LABEL: var_funnnel_v8i32:
247 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
248 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
249 ; AVX512BW-NEXT: vpsrld $1, %ymm1, %ymm1
250 ; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
251 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
252 ; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
253 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
254 ; AVX512BW-NEXT: retq
256 ; AVX512VBMI2-LABEL: var_funnnel_v8i32:
257 ; AVX512VBMI2: # %bb.0:
258 ; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
259 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
260 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
261 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
262 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
263 ; AVX512VBMI2-NEXT: retq
265 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
266 ; AVX512VLBW: # %bb.0:
267 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
268 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
269 ; AVX512VLBW-NEXT: vpsrld $1, %ymm1, %ymm1
270 ; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
271 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
272 ; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
273 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
274 ; AVX512VLBW-NEXT: retq
276 ; AVX10-LABEL: var_funnnel_v8i32:
278 ; AVX10-NEXT: vpshldvd %ymm2, %ymm1, %ymm0
281 ; XOPAVX1-LABEL: var_funnnel_v8i32:
283 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
284 ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
285 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
286 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm3, %xmm3
287 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [4294967265,4294967265,4294967265,4294967265]
288 ; XOPAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
289 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
290 ; XOPAVX1-NEXT: vpsrld $1, %xmm6, %xmm6
291 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm6, %xmm4
292 ; XOPAVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
293 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
294 ; XOPAVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
295 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
296 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
297 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
298 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
301 ; XOPAVX2-LABEL: var_funnnel_v8i32:
303 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
304 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
305 ; XOPAVX2-NEXT: vpsrld $1, %ymm1, %ymm1
306 ; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
307 ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
308 ; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
309 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
311 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; Variable funnel shift left on <16 x i16>: %x, %y and the shift-amount vector
; %amt are forwarded unchanged to @llvm.fshl.v16i16 (the third operand is the
; shift amount per the LLVM LangRef).
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file) and
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
; As recorded below, the AVX512VBMI2 and AVX10 prefixes expect a single
; vpshldvw, while the remaining prefixes expect longer expanded sequences.
315 define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
316 ; AVX1-LABEL: var_funnnel_v16i16:
318 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
319 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
320 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
321 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
322 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
323 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
324 ; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
325 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [1065353216,1065353216,1065353216,1065353216]
326 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
327 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
328 ; AVX1-NEXT: vpmulld %xmm7, %xmm5, %xmm5
329 ; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5
330 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
331 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
332 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
333 ; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4
334 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
335 ; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3
336 ; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3
337 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
338 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
339 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4,4,5,5,6,6,7,7]
340 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
341 ; AVX1-NEXT: vpaddd %xmm5, %xmm8, %xmm5
342 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
343 ; AVX1-NEXT: vpmulld %xmm5, %xmm4, %xmm4
344 ; AVX1-NEXT: vpsrld $16, %xmm4, %xmm4
345 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
346 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
347 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
348 ; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1
349 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
350 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
351 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
352 ; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
353 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
356 ; AVX2-LABEL: var_funnnel_v16i16:
358 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
359 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
360 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
361 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
362 ; AVX2-NEXT: vpsllvd %ymm5, %ymm3, %ymm3
363 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
364 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
365 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
366 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
367 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
368 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
371 ; AVX512F-LABEL: var_funnnel_v16i16:
373 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
374 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
375 ; AVX512F-NEXT: vpslld $16, %zmm0, %zmm0
376 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
377 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
378 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
379 ; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
380 ; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0
381 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
384 ; AVX512VL-LABEL: var_funnnel_v16i16:
386 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
387 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
388 ; AVX512VL-NEXT: vpslld $16, %zmm0, %zmm0
389 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
390 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
391 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
392 ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
393 ; AVX512VL-NEXT: vpsrld $16, %zmm0, %zmm0
394 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
395 ; AVX512VL-NEXT: retq
397 ; AVX512BW-LABEL: var_funnnel_v16i16:
399 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
400 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
401 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
402 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
403 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
404 ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
405 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
406 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
407 ; AVX512BW-NEXT: retq
409 ; AVX512VBMI2-LABEL: var_funnnel_v16i16:
410 ; AVX512VBMI2: # %bb.0:
411 ; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
412 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
413 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
414 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
415 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
416 ; AVX512VBMI2-NEXT: retq
418 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
419 ; AVX512VLBW: # %bb.0:
420 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
421 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
422 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
423 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
424 ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
425 ; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
426 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
427 ; AVX512VLBW-NEXT: retq
429 ; AVX10-LABEL: var_funnnel_v16i16:
431 ; AVX10-NEXT: vpshldvw %ymm2, %ymm1, %ymm0
434 ; XOPAVX1-LABEL: var_funnnel_v16i16:
436 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
437 ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
438 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
439 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm3, %xmm3
440 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [65521,65521,65521,65521,65521,65521,65521,65521]
441 ; XOPAVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
442 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
443 ; XOPAVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
444 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm6, %xmm4
445 ; XOPAVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
446 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
447 ; XOPAVX1-NEXT: vpaddw %xmm5, %xmm2, %xmm2
448 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
449 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
450 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
451 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
454 ; XOPAVX2-LABEL: var_funnnel_v16i16:
456 ; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
457 ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
458 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
459 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
460 ; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
461 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
462 ; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
463 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
464 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
465 ; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
466 ; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
467 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
468 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
469 ; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
470 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
471 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
472 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
473 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
475 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
479 define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
480 ; AVX1-LABEL: var_funnnel_v32i8:
482 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
483 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
484 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
485 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
486 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
487 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm7
488 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
489 ; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
490 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
491 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
492 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
493 ; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
494 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
495 ; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
496 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
497 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
498 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
499 ; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6
500 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
501 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
502 ; AVX1-NEXT: vpslld $23, %xmm9, %xmm9
503 ; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9
504 ; AVX1-NEXT: vcvttps2dq %xmm9, %xmm9
505 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
506 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
507 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
508 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
509 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
510 ; AVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4
511 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
512 ; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
513 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
514 ; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
515 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
516 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7]
517 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
518 ; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
519 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
520 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
521 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
522 ; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
523 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
524 ; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
525 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
526 ; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
527 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
528 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
529 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
530 ; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
531 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
532 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
533 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
534 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
535 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
536 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
537 ; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
538 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
539 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
540 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
541 ; AVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
542 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
545 ; AVX2-LABEL: var_funnnel_v32i8:
547 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
548 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
549 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
550 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
551 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
552 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
553 ; AVX2-NEXT: vpsllvd %ymm7, %ymm5, %ymm5
554 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
555 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
556 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
557 ; AVX2-NEXT: vpsllvd %ymm6, %ymm3, %ymm3
558 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
559 ; AVX2-NEXT: vpackusdw %ymm5, %ymm3, %ymm3
560 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
561 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
562 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
563 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
564 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
565 ; AVX2-NEXT: vpsllvd %ymm5, %ymm1, %ymm1
566 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
567 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
568 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
569 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
570 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
571 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
572 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
573 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
576 ; AVX512F-LABEL: var_funnnel_v32i8:
578 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
579 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
580 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
581 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
582 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
583 ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
584 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
585 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
586 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
587 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6
588 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
589 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
590 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
591 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
592 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5
593 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
594 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
595 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
596 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
597 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
598 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
599 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
600 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
601 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
602 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
603 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
604 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
605 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
606 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
607 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
610 ; AVX512VL-LABEL: var_funnnel_v32i8:
612 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
613 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
614 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
615 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
616 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
617 ; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
618 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
619 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
620 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
621 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
622 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
623 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
624 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
625 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
626 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
627 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
628 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
629 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
630 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
631 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
632 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
633 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
634 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
635 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
636 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
637 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
638 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
639 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
640 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
641 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
642 ; AVX512VL-NEXT: retq
644 ; AVX512BW-LABEL: var_funnnel_v32i8:
646 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
647 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
648 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
649 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
650 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
651 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
652 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
653 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
654 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
655 ; AVX512BW-NEXT: retq
657 ; AVX512VBMI2-LABEL: var_funnnel_v32i8:
658 ; AVX512VBMI2: # %bb.0:
659 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
660 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
661 ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
662 ; AVX512VBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
663 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
664 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
665 ; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm3, %zmm0
666 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
667 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
668 ; AVX512VBMI2-NEXT: retq
670 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
671 ; AVX512VLBW: # %bb.0:
672 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
673 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
674 ; AVX512VLBW-NEXT: vpsllw $8, %zmm0, %zmm0
675 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
676 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
677 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
678 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
679 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
680 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
681 ; AVX512VLBW-NEXT: retq
683 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
684 ; AVX512VLVBMI2: # %bb.0:
685 ; AVX512VLVBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
686 ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
687 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
688 ; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
689 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
690 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
691 ; AVX512VLVBMI2-NEXT: vpsllvw %zmm0, %zmm3, %zmm0
692 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
693 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
694 ; AVX512VLVBMI2-NEXT: retq
696 ; AVX10_256-LABEL: var_funnnel_v32i8:
697 ; AVX10_256: # %bb.0:
698 ; AVX10_256-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
699 ; AVX10_256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
700 ; AVX10_256-NEXT: vpxor %xmm4, %xmm4, %xmm4
701 ; AVX10_256-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
702 ; AVX10_256-NEXT: vpsllvw %ymm5, %ymm3, %ymm3
703 ; AVX10_256-NEXT: vpsrlw $8, %ymm3, %ymm3
704 ; AVX10_256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
705 ; AVX10_256-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
706 ; AVX10_256-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
707 ; AVX10_256-NEXT: vpsrlw $8, %ymm0, %ymm0
708 ; AVX10_256-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
709 ; AVX10_256-NEXT: retq
711 ; XOPAVX1-LABEL: var_funnnel_v32i8:
713 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
714 ; XOPAVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
715 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3
716 ; XOPAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
717 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
718 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
719 ; XOPAVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm7
720 ; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3
721 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
722 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm7, %xmm5
723 ; XOPAVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
724 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
725 ; XOPAVX1-NEXT: vpaddb %xmm6, %xmm2, %xmm4
726 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
727 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
728 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
729 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
732 ; XOPAVX2-LABEL: var_funnnel_v32i8:
734 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
735 ; XOPAVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
736 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3
737 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
738 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
739 ; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249]
740 ; XOPAVX2-NEXT: vpaddb %xmm6, %xmm5, %xmm7
741 ; XOPAVX2-NEXT: vpshlb %xmm7, %xmm3, %xmm3
742 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
743 ; XOPAVX2-NEXT: vpaddb %xmm6, %xmm2, %xmm4
744 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
745 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
746 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
747 ; XOPAVX2-NEXT: vpshlb %xmm5, %xmm3, %xmm3
748 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
749 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
750 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
752 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
757 ; Uniform Variable Shifts (the run-time shift amount is element 0 of %amt splatted to every lane)
; Funnel shift left of <4 x i64> with a uniform run-time shift amount.
; Element 0 of %amt is splatted to all four lanes and used as the shift operand
; of llvm.fshl.v4i64 (%x supplies the high half, %y the low half). The all-zero
; shuffle mask never selects the second shuffle operand, so it is poison.
760 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind {
761 ; AVX1-LABEL: splatvar_funnnel_v4i64:
763 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
764 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
765 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
766 ; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
767 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
768 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
769 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
770 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
771 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
772 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
773 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
774 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
775 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
776 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
779 ; AVX2-LABEL: splatvar_funnnel_v4i64:
781 ; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
782 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
783 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
784 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
785 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
786 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
787 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
790 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
792 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
793 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
794 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
795 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
796 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
797 ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0
798 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
801 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
803 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
804 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
805 ; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
806 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
807 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
808 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0
809 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
810 ; AVX512VL-NEXT: retq
812 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
814 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
815 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
816 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
817 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
818 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
819 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
820 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
821 ; AVX512BW-NEXT: retq
823 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
824 ; AVX512VBMI2: # %bb.0:
825 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
826 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
827 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
828 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
829 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
830 ; AVX512VBMI2-NEXT: retq
832 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
833 ; AVX512VLBW: # %bb.0:
834 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
835 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
836 ; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
837 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
838 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
839 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
840 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
841 ; AVX512VLBW-NEXT: retq
843 ; AVX10-LABEL: splatvar_funnnel_v4i64:
845 ; AVX10-NEXT: vpbroadcastq %xmm2, %ymm2
846 ; AVX10-NEXT: vpshldvq %ymm2, %ymm1, %ymm0
849 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
851 ; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
852 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
853 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
854 ; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
855 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5
856 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
857 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
858 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
859 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
860 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
861 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
862 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
863 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
864 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
867 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
869 ; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
870 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
871 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
872 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
873 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
874 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
875 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
877 %splat = shufflevector <4 x i64> %amt, <4 x i64> poison, <4 x i32> zeroinitializer
878 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat)
; Funnel shift left of <8 x i32> with a uniform run-time shift amount.
; Element 0 of %amt is splatted to all eight lanes and used as the shift operand
; of llvm.fshl.v8i32 (%x supplies the high half, %y the low half). The all-zero
; shuffle mask never selects the second shuffle operand, so it is poison.
882 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind {
883 ; AVX1-LABEL: splatvar_funnnel_v8i32:
885 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
886 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
887 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
888 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
889 ; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5
890 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
891 ; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6
892 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
893 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
894 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
895 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
896 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
897 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
898 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm5[1,3],ymm0[5,7],ymm5[5,7]
901 ; AVX2-LABEL: splatvar_funnnel_v8i32:
903 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
904 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
905 ; AVX2-NEXT: vpsllq %xmm2, %ymm3, %ymm3
906 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
907 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
908 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
911 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
913 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
914 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
915 ; AVX512F-NEXT: vpsllq %xmm2, %ymm3, %ymm3
916 ; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
917 ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0
918 ; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
921 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
923 ; AVX512VL-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
924 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
925 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm3, %ymm3
926 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
927 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0
928 ; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
929 ; AVX512VL-NEXT: retq
931 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
933 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
934 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
935 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm3, %ymm3
936 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
937 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
938 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
939 ; AVX512BW-NEXT: retq
941 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
942 ; AVX512VBMI2: # %bb.0:
943 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
944 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
945 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
946 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
947 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
948 ; AVX512VBMI2-NEXT: retq
950 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
951 ; AVX512VLBW: # %bb.0:
952 ; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
953 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
954 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm3, %ymm3
955 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
956 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
957 ; AVX512VLBW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
958 ; AVX512VLBW-NEXT: retq
960 ; AVX10-LABEL: splatvar_funnnel_v8i32:
962 ; AVX10-NEXT: vpbroadcastd %xmm2, %ymm2
963 ; AVX10-NEXT: vpshldvd %ymm2, %ymm1, %ymm0
966 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
968 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
969 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
970 ; XOPAVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
971 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
972 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5
973 ; XOPAVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
974 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6
975 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
976 ; XOPAVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
977 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
978 ; XOPAVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
979 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
980 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
981 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm5[1,3],ymm0[5,7],ymm5[5,7]
984 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
986 ; XOPAVX2-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
987 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
988 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm3, %ymm3
989 ; XOPAVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
990 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
991 ; XOPAVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
993 %splat = shufflevector <8 x i32> %amt, <8 x i32> poison, <8 x i32> zeroinitializer
994 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %splat)
; Funnel shift left of <16 x i16> with a uniform run-time shift amount.
; Element 0 of %amt is splatted to all sixteen lanes and used as the shift
; operand of llvm.fshl.v16i16 (%x supplies the high half, %y the low half). The
; all-zero shuffle mask never selects the second shuffle operand, so it is poison.
998 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
999 ; AVX1-LABEL: splatvar_funnnel_v16i16:
1001 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1002 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
1003 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1004 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
1005 ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
1006 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1007 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1008 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1009 ; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
1010 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1011 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1012 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1013 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1014 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1017 ; AVX2-LABEL: splatvar_funnnel_v16i16:
1019 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
1020 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1021 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1022 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1023 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1024 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1025 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1028 ; AVX512F-LABEL: splatvar_funnnel_v16i16:
1030 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
1031 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
1032 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
1033 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1034 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
1035 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1036 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1037 ; AVX512F-NEXT: retq
1039 ; AVX512VL-LABEL: splatvar_funnnel_v16i16:
1040 ; AVX512VL: # %bb.0:
1041 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
1042 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
1043 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
1044 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1045 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
1046 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1047 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1048 ; AVX512VL-NEXT: retq
1050 ; AVX512BW-LABEL: splatvar_funnnel_v16i16:
1051 ; AVX512BW: # %bb.0:
1052 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
1053 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1054 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
1055 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1056 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
1057 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1058 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1059 ; AVX512BW-NEXT: retq
1061 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
1062 ; AVX512VBMI2: # %bb.0:
1063 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1064 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1065 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
1066 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
1067 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1068 ; AVX512VBMI2-NEXT: retq
1070 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
1071 ; AVX512VLBW: # %bb.0:
1072 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
1073 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1074 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
1075 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1076 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
1077 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1078 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1079 ; AVX512VLBW-NEXT: retq
1081 ; AVX10-LABEL: splatvar_funnnel_v16i16:
1083 ; AVX10-NEXT: vpbroadcastw %xmm2, %ymm2
1084 ; AVX10-NEXT: vpshldvw %ymm2, %ymm1, %ymm0
1087 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
1089 ; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1090 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
1091 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1092 ; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
1093 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
1094 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1095 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1096 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1097 ; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
1098 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1099 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1100 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1101 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1102 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1103 ; XOPAVX1-NEXT: retq
1105 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
1107 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
1108 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1109 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1110 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1111 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1112 ; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1113 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1114 ; XOPAVX2-NEXT: retq
1115 %splat = shufflevector <16 x i16> %amt, <16 x i16> poison, <16 x i32> zeroinitializer
1116 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
1120 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
1121 ; AVX1-LABEL: splatvar_funnnel_v32i8:
1123 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1124 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1125 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
1126 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1127 ; AVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5
1128 ; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
1129 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1130 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1131 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1132 ; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
1133 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1134 ; AVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm4
1135 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
1136 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1137 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1138 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1139 ; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
1140 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1143 ; AVX2-LABEL: splatvar_funnnel_v32i8:
1145 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1146 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1147 ; AVX2-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1148 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
1149 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1150 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1151 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1152 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1155 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
1157 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1158 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1159 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1160 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1161 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1162 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1163 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1164 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1165 ; AVX512F-NEXT: retq
1167 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
1168 ; AVX512VL: # %bb.0:
1169 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1170 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1171 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1172 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
1173 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1174 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1175 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1176 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1177 ; AVX512VL-NEXT: retq
1179 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
1180 ; AVX512BW: # %bb.0:
1181 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1182 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1183 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1184 ; AVX512BW-NEXT: vpsrlw $8, %ymm3, %ymm3
1185 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1186 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1187 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
1188 ; AVX512BW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1189 ; AVX512BW-NEXT: retq
1191 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
1192 ; AVX512VBMI2: # %bb.0:
1193 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1194 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1195 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1196 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm3, %ymm3
1197 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1198 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1199 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
1200 ; AVX512VBMI2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1201 ; AVX512VBMI2-NEXT: retq
1203 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
1204 ; AVX512VLBW: # %bb.0:
1205 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1206 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1207 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1208 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm3, %ymm3
1209 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1210 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1211 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
1212 ; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1213 ; AVX512VLBW-NEXT: retq
1215 ; AVX10-LABEL: splatvar_funnnel_v32i8:
1217 ; AVX10-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1218 ; AVX10-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1219 ; AVX10-NEXT: vpsllw %xmm2, %ymm3, %ymm3
1220 ; AVX10-NEXT: vpsrlw $8, %ymm3, %ymm3
1221 ; AVX10-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1222 ; AVX10-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1223 ; AVX10-NEXT: vpsrlw $8, %ymm0, %ymm0
1224 ; AVX10-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1227 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
1229 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1230 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1231 ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
1232 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1233 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5
1234 ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1235 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1236 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1237 ; XOPAVX1-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3
1238 ; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1239 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5
1240 ; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1241 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1242 ; XOPAVX1-NEXT: vpperm %xmm4, %xmm5, %xmm0, %xmm0
1243 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
1244 ; XOPAVX1-NEXT: retq
1246 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
1248 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1249 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
1250 ; XOPAVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
1251 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1252 ; XOPAVX2-NEXT: vpsllw %xmm2, %xmm5, %xmm5
1253 ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1254 ; XOPAVX2-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1255 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1256 ; XOPAVX2-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3
1257 ; XOPAVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1258 ; XOPAVX2-NEXT: vpsllw %xmm2, %xmm5, %xmm5
1259 ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1260 ; XOPAVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1261 ; XOPAVX2-NEXT: vpperm %xmm4, %xmm5, %xmm0, %xmm0
1262 ; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1263 ; XOPAVX2-NEXT: retq
1264 %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
1265 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat)
1269 ; Harder PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
1270 ; CGP should sink splatted select operands through the funnel shift.
; Test input: a loop whose %index starts at 0 and advances by 8 until it
; equals 1024. Each iteration loads 8 control bytes from %control + %index,
; selects per lane the splat of %rot0 (control byte == 0) or the splat of
; %rot1 (otherwise) as the shift amount, applies llvm.fshl.v8i32 with the same
; loaded <8 x i32> vector as both data operands (i.e. a rotate left), and
; stores the result back to the same address in %arr.
1272 define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
1273 ; AVX1-LABEL: fancierRotate2:
1274 ; AVX1: # %bb.0: # %entry
1275 ; AVX1-NEXT: vmovd %edx, %xmm1
1276 ; AVX1-NEXT: vmovd %ecx, %xmm2
1277 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
1278 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
1279 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [31,0]
1280 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1281 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1282 ; AVX1-NEXT: .p2align 4
1283 ; AVX1-NEXT: .LBB8_1: # %loop
1284 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
1285 ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
1286 ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm3
1287 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4
1288 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
1289 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
1290 ; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm5
1291 ; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm6
1292 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3]
1293 ; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm8
1294 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
1295 ; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm9
1296 ; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm9[1,3],xmm8[1,3]
1297 ; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3]
1298 ; AVX1-NEXT: vpsllq %xmm1, %xmm9, %xmm10
1299 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
1300 ; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm11
1301 ; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,3],xmm10[1,3]
1302 ; AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm7
1303 ; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5
1304 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3]
1305 ; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm5, %xmm4
1306 ; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm5
1307 ; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6
1308 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,3],xmm5[1,3]
1309 ; AVX1-NEXT: vblendvps %xmm3, %xmm10, %xmm5, %xmm3
1310 ; AVX1-NEXT: vmovups %xmm4, 4096(%rdi,%rax,4)
1311 ; AVX1-NEXT: vmovups %xmm3, 4112(%rdi,%rax,4)
1312 ; AVX1-NEXT: addq $8, %rax
1313 ; AVX1-NEXT: jne .LBB8_1
1314 ; AVX1-NEXT: # %bb.2: # %exit
1315 ; AVX1-NEXT: vzeroupper
1318 ; AVX2-LABEL: fancierRotate2:
1319 ; AVX2: # %bb.0: # %entry
1320 ; AVX2-NEXT: vmovd %edx, %xmm0
1321 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
1322 ; AVX2-NEXT: vmovd %ecx, %xmm1
1323 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1324 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
1325 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1326 ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
1327 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
1328 ; AVX2-NEXT: .p2align 4
1329 ; AVX2-NEXT: .LBB8_1: # %loop
1330 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
1331 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1332 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm5
1333 ; AVX2-NEXT: vblendvps %ymm5, %ymm0, %ymm1, %ymm5
1334 ; AVX2-NEXT: vandps %ymm3, %ymm5, %ymm5
1335 ; AVX2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm6
1336 ; AVX2-NEXT: vpsllvd %ymm5, %ymm6, %ymm7
1337 ; AVX2-NEXT: vpsubd %ymm5, %ymm4, %ymm5
1338 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5
1339 ; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5
1340 ; AVX2-NEXT: vmovdqu %ymm5, 4096(%rdi,%rax,4)
1341 ; AVX2-NEXT: addq $8, %rax
1342 ; AVX2-NEXT: jne .LBB8_1
1343 ; AVX2-NEXT: # %bb.2: # %exit
1344 ; AVX2-NEXT: vzeroupper
1347 ; AVX512F-LABEL: fancierRotate2:
1348 ; AVX512F: # %bb.0: # %entry
1349 ; AVX512F-NEXT: vmovd %edx, %xmm0
1350 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm0
1351 ; AVX512F-NEXT: vmovd %ecx, %xmm1
1352 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
1353 ; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
1354 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1355 ; AVX512F-NEXT: .p2align 4
1356 ; AVX512F-NEXT: .LBB8_1: # %loop
1357 ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
1358 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1359 ; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1360 ; AVX512F-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1361 ; AVX512F-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4
1362 ; AVX512F-NEXT: vprolvd %zmm3, %zmm4, %zmm3
1363 ; AVX512F-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4)
1364 ; AVX512F-NEXT: addq $8, %rax
1365 ; AVX512F-NEXT: jne .LBB8_1
1366 ; AVX512F-NEXT: # %bb.2: # %exit
1367 ; AVX512F-NEXT: vzeroupper
1368 ; AVX512F-NEXT: retq
1370 ; AVX512VL-LABEL: fancierRotate2:
1371 ; AVX512VL: # %bb.0: # %entry
1372 ; AVX512VL-NEXT: vpbroadcastd %edx, %ymm0
1373 ; AVX512VL-NEXT: vpbroadcastd %ecx, %ymm1
1374 ; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00
1375 ; AVX512VL-NEXT: .p2align 4
1376 ; AVX512VL-NEXT: .LBB8_1: # %loop
1377 ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
1378 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1379 ; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
1380 ; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
1381 ; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3
1382 ; AVX512VL-NEXT: vprolvd %ymm2, %ymm3, %ymm2
1383 ; AVX512VL-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4)
1384 ; AVX512VL-NEXT: addq $8, %rax
1385 ; AVX512VL-NEXT: jne .LBB8_1
1386 ; AVX512VL-NEXT: # %bb.2: # %exit
1387 ; AVX512VL-NEXT: vzeroupper
1388 ; AVX512VL-NEXT: retq
1390 ; AVX512BW-LABEL: fancierRotate2:
1391 ; AVX512BW: # %bb.0: # %entry
1392 ; AVX512BW-NEXT: vmovd %edx, %xmm0
1393 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0
1394 ; AVX512BW-NEXT: vmovd %ecx, %xmm1
1395 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
1396 ; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
1397 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
1398 ; AVX512BW-NEXT: .p2align 4
1399 ; AVX512BW-NEXT: .LBB8_1: # %loop
1400 ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
1401 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1402 ; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1403 ; AVX512BW-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1404 ; AVX512BW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4
1405 ; AVX512BW-NEXT: vprolvd %zmm3, %zmm4, %zmm3
1406 ; AVX512BW-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4)
1407 ; AVX512BW-NEXT: addq $8, %rax
1408 ; AVX512BW-NEXT: jne .LBB8_1
1409 ; AVX512BW-NEXT: # %bb.2: # %exit
1410 ; AVX512BW-NEXT: vzeroupper
1411 ; AVX512BW-NEXT: retq
1413 ; AVX512VBMI2-LABEL: fancierRotate2:
1414 ; AVX512VBMI2: # %bb.0: # %entry
1415 ; AVX512VBMI2-NEXT: vmovd %edx, %xmm0
1416 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %ymm0
1417 ; AVX512VBMI2-NEXT: vmovd %ecx, %xmm1
1418 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
1419 ; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
1420 ; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1421 ; AVX512VBMI2-NEXT: .p2align 4
1422 ; AVX512VBMI2-NEXT: .LBB8_1: # %loop
1423 ; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1
1424 ; AVX512VBMI2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1425 ; AVX512VBMI2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1426 ; AVX512VBMI2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1427 ; AVX512VBMI2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4
1428 ; AVX512VBMI2-NEXT: vprolvd %zmm3, %zmm4, %zmm3
1429 ; AVX512VBMI2-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4)
1430 ; AVX512VBMI2-NEXT: addq $8, %rax
1431 ; AVX512VBMI2-NEXT: jne .LBB8_1
1432 ; AVX512VBMI2-NEXT: # %bb.2: # %exit
1433 ; AVX512VBMI2-NEXT: vzeroupper
1434 ; AVX512VBMI2-NEXT: retq
1436 ; AVX512VLBW-LABEL: fancierRotate2:
1437 ; AVX512VLBW: # %bb.0: # %entry
1438 ; AVX512VLBW-NEXT: vpbroadcastd %edx, %ymm0
1439 ; AVX512VLBW-NEXT: vpbroadcastd %ecx, %ymm1
1440 ; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00
1441 ; AVX512VLBW-NEXT: .p2align 4
1442 ; AVX512VLBW-NEXT: .LBB8_1: # %loop
1443 ; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1
1444 ; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1445 ; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
1446 ; AVX512VLBW-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
1447 ; AVX512VLBW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3
1448 ; AVX512VLBW-NEXT: vprolvd %ymm2, %ymm3, %ymm2
1449 ; AVX512VLBW-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4)
1450 ; AVX512VLBW-NEXT: addq $8, %rax
1451 ; AVX512VLBW-NEXT: jne .LBB8_1
1452 ; AVX512VLBW-NEXT: # %bb.2: # %exit
1453 ; AVX512VLBW-NEXT: vzeroupper
1454 ; AVX512VLBW-NEXT: retq
1456 ; AVX10-LABEL: fancierRotate2:
1457 ; AVX10: # %bb.0: # %entry
1458 ; AVX10-NEXT: vpbroadcastd %edx, %ymm0
1459 ; AVX10-NEXT: vpbroadcastd %ecx, %ymm1
1460 ; AVX10-NEXT: movq $-1024, %rax # imm = 0xFC00
1461 ; AVX10-NEXT: .p2align 4
1462 ; AVX10-NEXT: .LBB8_1: # %loop
1463 ; AVX10-NEXT: # =>This Inner Loop Header: Depth=1
1464 ; AVX10-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
1465 ; AVX10-NEXT: vptestnmb %xmm2, %xmm2, %k1
1466 ; AVX10-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
1467 ; AVX10-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3
1468 ; AVX10-NEXT: vprolvd %ymm2, %ymm3, %ymm2
1469 ; AVX10-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4)
1470 ; AVX10-NEXT: addq $8, %rax
1471 ; AVX10-NEXT: jne .LBB8_1
1472 ; AVX10-NEXT: # %bb.2: # %exit
1473 ; AVX10-NEXT: vzeroupper
1476 ; XOPAVX1-LABEL: fancierRotate2:
1477 ; XOPAVX1: # %bb.0: # %entry
1478 ; XOPAVX1-NEXT: vmovd %edx, %xmm0
1479 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1480 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1481 ; XOPAVX1-NEXT: vmovd %ecx, %xmm1
1482 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1483 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
1484 ; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
1485 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1486 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1487 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1488 ; XOPAVX1-NEXT: .p2align 4
1489 ; XOPAVX1-NEXT: .LBB8_1: # %loop
1490 ; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1
1491 ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
1492 ; XOPAVX1-NEXT: vpcomeqb %xmm2, %xmm5, %xmm5
1493 ; XOPAVX1-NEXT: vpmovsxbd %xmm5, %xmm6
1494 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
1495 ; XOPAVX1-NEXT: vpmovsxbd %xmm5, %xmm5
1496 ; XOPAVX1-NEXT: vblendvps %xmm5, %xmm3, %xmm4, %xmm5
1497 ; XOPAVX1-NEXT: vprotd %xmm5, 4112(%rdi,%rax,4), %xmm5
1498 ; XOPAVX1-NEXT: vblendvps %xmm6, %xmm0, %xmm1, %xmm6
1499 ; XOPAVX1-NEXT: vprotd %xmm6, 4096(%rdi,%rax,4), %xmm6
1500 ; XOPAVX1-NEXT: vmovdqu %xmm6, 4096(%rdi,%rax,4)
1501 ; XOPAVX1-NEXT: vmovdqu %xmm5, 4112(%rdi,%rax,4)
1502 ; XOPAVX1-NEXT: addq $8, %rax
1503 ; XOPAVX1-NEXT: jne .LBB8_1
1504 ; XOPAVX1-NEXT: # %bb.2: # %exit
1505 ; XOPAVX1-NEXT: vzeroupper
1506 ; XOPAVX1-NEXT: retq
1508 ; XOPAVX2-LABEL: fancierRotate2:
1509 ; XOPAVX2: # %bb.0: # %entry
1510 ; XOPAVX2-NEXT: vmovd %edx, %xmm0
1511 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %ymm0
1512 ; XOPAVX2-NEXT: vmovd %ecx, %xmm1
1513 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1
1514 ; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
1515 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1516 ; XOPAVX2-NEXT: .p2align 4
1517 ; XOPAVX2-NEXT: .LBB8_1: # %loop
1518 ; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1
1519 ; XOPAVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1520 ; XOPAVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
1521 ; XOPAVX2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
1522 ; XOPAVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
1523 ; XOPAVX2-NEXT: vprotd %xmm4, 4112(%rdi,%rax,4), %xmm4
1524 ; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm3
1525 ; XOPAVX2-NEXT: vmovdqu %xmm3, 4096(%rdi,%rax,4)
1526 ; XOPAVX2-NEXT: vmovdqu %xmm4, 4112(%rdi,%rax,4)
1527 ; XOPAVX2-NEXT: addq $8, %rax
1528 ; XOPAVX2-NEXT: jne .LBB8_1
1529 ; XOPAVX2-NEXT: # %bb.2: # %exit
1530 ; XOPAVX2-NEXT: vzeroupper
1531 ; XOPAVX2-NEXT: retq
1533 %i0 = insertelement <8 x i32> undef, i32 %rot0, i32 0
1534 %s0 = shufflevector <8 x i32> %i0, <8 x i32> undef, <8 x i32> zeroinitializer
1535 %i1 = insertelement <8 x i32> undef, i32 %rot1, i32 0
1536 %s1 = shufflevector <8 x i32> %i1, <8 x i32> undef, <8 x i32> zeroinitializer
1540 %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
1541 %t0 = getelementptr inbounds i8, ptr %control, i64 %index
1542 %wide.load = load <8 x i8>, ptr %t0, align 1
1543 %t2 = icmp eq <8 x i8> %wide.load, zeroinitializer
1544 %shamt = select <8 x i1> %t2, <8 x i32> %s0, <8 x i32> %s1
1545 %t4 = getelementptr inbounds i32, ptr %arr, i64 %index
1546 %wide.load21 = load <8 x i32>, ptr %t4, align 4
1547 %rot = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load21, <8 x i32> %wide.load21, <8 x i32> %shamt)
1548 store <8 x i32> %rot, ptr %t4, align 4
1549 %index.next = add i64 %index, 8
1550 %t7 = icmp eq i64 %index.next, 1024
1551 br i1 %t7, label %exit, label %loop
; Test input: llvm.fshl.v4i64 of %x and %y with a constant, non-uniform
; per-element shift amount vector <4, 14, 50, 60>.
1561 define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
1562 ; AVX1-LABEL: constant_funnnel_v4i64:
1564 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1565 ; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm3
1566 ; AVX1-NEXT: vpsrlq $14, %xmm2, %xmm2
1567 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1568 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm3
1569 ; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1
1570 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1571 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1572 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1573 ; AVX1-NEXT: vpsllq $60, %xmm2, %xmm3
1574 ; AVX1-NEXT: vpsllq $50, %xmm2, %xmm2
1575 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1576 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
1577 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
1578 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
1579 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1580 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1583 ; AVX2-LABEL: constant_funnnel_v4i64:
1585 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1586 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1587 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1590 ; AVX512F-LABEL: constant_funnnel_v4i64:
1592 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1593 ; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1594 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1595 ; AVX512F-NEXT: retq
1597 ; AVX512VL-LABEL: constant_funnnel_v4i64:
1598 ; AVX512VL: # %bb.0:
1599 ; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1600 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1601 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1602 ; AVX512VL-NEXT: retq
1604 ; AVX512BW-LABEL: constant_funnnel_v4i64:
1605 ; AVX512BW: # %bb.0:
1606 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1607 ; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1608 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1609 ; AVX512BW-NEXT: retq
1611 ; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
1612 ; AVX512VBMI2: # %bb.0:
1613 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1614 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1615 ; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,14,50,60]
1616 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
1617 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1618 ; AVX512VBMI2-NEXT: retq
1620 ; AVX512VLBW-LABEL: constant_funnnel_v4i64:
1621 ; AVX512VLBW: # %bb.0:
1622 ; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1623 ; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1624 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1625 ; AVX512VLBW-NEXT: retq
1627 ; AVX10-LABEL: constant_funnnel_v4i64:
1629 ; AVX10-NEXT: vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1632 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
1634 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1635 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1636 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1637 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1638 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1639 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1640 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1641 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1642 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1643 ; XOPAVX1-NEXT: retq
1645 ; XOPAVX2-LABEL: constant_funnnel_v4i64:
1647 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1648 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1649 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1650 ; XOPAVX2-NEXT: retq
1651 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
; Test input: llvm.fshl.v8i32 of %x and %y with a constant, non-uniform
; per-element shift amount vector <4, 5, 6, 7, 8, 9, 10, 11>.
1655 define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
1656 ; AVX1-LABEL: constant_funnnel_v8i32:
1658 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1659 ; AVX1-NEXT: vpsrld $21, %xmm2, %xmm3
1660 ; AVX1-NEXT: vpsrld $23, %xmm2, %xmm4
1661 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1662 ; AVX1-NEXT: vpsrld $22, %xmm2, %xmm4
1663 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
1664 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1665 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1666 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1667 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
1668 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
1669 ; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3
1670 ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm4
1671 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1672 ; AVX1-NEXT: vpsrld $26, %xmm1, %xmm4
1673 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
1674 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1675 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1676 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1677 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1678 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1681 ; AVX2-LABEL: constant_funnnel_v8i32:
1683 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1684 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1685 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1688 ; AVX512F-LABEL: constant_funnnel_v8i32:
1690 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1691 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1692 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1693 ; AVX512F-NEXT: retq
1695 ; AVX512VL-LABEL: constant_funnnel_v8i32:
1696 ; AVX512VL: # %bb.0:
1697 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1698 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1699 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1700 ; AVX512VL-NEXT: retq
1702 ; AVX512BW-LABEL: constant_funnnel_v8i32:
1703 ; AVX512BW: # %bb.0:
1704 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1705 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1706 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1707 ; AVX512BW-NEXT: retq
1709 ; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
1710 ; AVX512VBMI2: # %bb.0:
1711 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1712 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1713 ; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11]
1714 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
1715 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1716 ; AVX512VBMI2-NEXT: retq
1718 ; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1719 ; AVX512VLBW: # %bb.0:
1720 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1721 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1722 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1723 ; AVX512VLBW-NEXT: retq
1725 ; AVX10-LABEL: constant_funnnel_v8i32:
1727 ; AVX10-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1730 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
1732 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1733 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1734 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1735 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
1736 ; XOPAVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
1737 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1738 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1739 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1740 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1741 ; XOPAVX1-NEXT: retq
1743 ; XOPAVX2-LABEL: constant_funnnel_v8i32:
1745 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1746 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1747 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1748 ; XOPAVX2-NEXT: retq
1749 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
1753 define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
1754 ; AVX1-LABEL: constant_funnnel_v16i16:
1756 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1757 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
1758 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 # [512,1024,2048,4096,8192,16384,32768,u]
1759 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6],xmm2[7]
1760 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1761 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256]
1762 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1763 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [1,2,4,8,16,32,64,128]
1764 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1765 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,512,1024,2048,4096,8192,16384,32768]
1766 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1767 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1770 ; AVX2-LABEL: constant_funnnel_v16i16:
1772 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1773 ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u]
1774 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
1775 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1776 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1777 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1780 ; AVX512F-LABEL: constant_funnnel_v16i16:
1782 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
1783 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u]
1784 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
1785 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1786 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1787 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1788 ; AVX512F-NEXT: retq
1790 ; AVX512VL-LABEL: constant_funnnel_v16i16:
1791 ; AVX512VL: # %bb.0:
1792 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
1793 ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u]
1794 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
1795 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1796 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1797 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1798 ; AVX512VL-NEXT: retq
1800 ; AVX512BW-LABEL: constant_funnnel_v16i16:
1801 ; AVX512BW: # %bb.0:
1802 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1803 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1804 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1805 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
1806 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
1807 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
1808 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1809 ; AVX512BW-NEXT: retq
1811 ; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
1812 ; AVX512VBMI2: # %bb.0:
1813 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1814 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1815 ; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1816 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
1817 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1818 ; AVX512VBMI2-NEXT: retq
1820 ; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1821 ; AVX512VLBW: # %bb.0:
1822 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1823 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
1824 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1825 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1826 ; AVX512VLBW-NEXT: retq
1828 ; AVX10-LABEL: constant_funnnel_v16i16:
1830 ; AVX10-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1833 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
1835 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1836 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1837 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1838 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1839 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
1840 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1841 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1842 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
1843 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1844 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1845 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1846 ; XOPAVX1-NEXT: retq
1848 ; XOPAVX2-LABEL: constant_funnnel_v16i16:
1850 ; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
1851 ; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u]
1852 ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15]
1853 ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1854 ; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1855 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1856 ; XOPAVX2-NEXT: retq
1857 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
; Funnel-shift-left of <32 x i8> operands by a non-uniform constant amount
; vector. The amounts passed to llvm.fshl.v32i8 are 0,1,...,7,8,7,...,1 and the
; same 16-entry pattern is repeated for the upper 16 bytes. The i8 8 entries
; equal the element bit width; they appear as 0 in the XOP shift vectors
; checked below. The assertion lines in this function are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file).
1861 define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
1862 ; AVX1-LABEL: constant_funnnel_v32i8:
1864 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1865 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1866 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1867 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
1868 ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
1869 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
1870 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1871 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
1872 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1873 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1874 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
1875 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1876 ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
1877 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
1878 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1879 ; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
1880 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1881 ; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
1882 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1885 ; AVX2-LABEL: constant_funnnel_v32i8:
1887 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1888 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1889 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
1890 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1891 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1892 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1893 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1896 ; AVX512F-LABEL: constant_funnnel_v32i8:
1898 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1899 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1900 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
1901 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1902 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1903 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1904 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1905 ; AVX512F-NEXT: retq
1907 ; AVX512VL-LABEL: constant_funnnel_v32i8:
1908 ; AVX512VL: # %bb.0:
1909 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1910 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1911 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1912 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1913 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1914 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1915 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1916 ; AVX512VL-NEXT: retq
1918 ; AVX512BW-LABEL: constant_funnnel_v32i8:
1919 ; AVX512BW: # %bb.0:
1920 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1921 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1922 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
1923 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1924 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1925 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
1926 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1927 ; AVX512BW-NEXT: retq
1929 ; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
1930 ; AVX512VBMI2: # %bb.0:
1931 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1932 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1933 ; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
1934 ; AVX512VBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm2
1935 ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1936 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
1937 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1938 ; AVX512VBMI2-NEXT: retq
1940 ; AVX512VLBW-LABEL: constant_funnnel_v32i8:
1941 ; AVX512VLBW: # %bb.0:
1942 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1943 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1944 ; AVX512VLBW-NEXT: vpsllw $8, %zmm0, %zmm0
1945 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1946 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1947 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
1948 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1949 ; AVX512VLBW-NEXT: retq
1951 ; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
1952 ; AVX512VLVBMI2: # %bb.0:
1953 ; AVX512VLVBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
1954 ; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1955 ; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
1956 ; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm2
1957 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
1958 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
1959 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
1960 ; AVX512VLVBMI2-NEXT: retq
1962 ; AVX10_256-LABEL: constant_funnnel_v32i8:
1963 ; AVX10_256: # %bb.0:
1964 ; AVX10_256-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
1965 ; AVX10_256-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
1966 ; AVX10_256-NEXT: vpsrlw $8, %ymm2, %ymm2
1967 ; AVX10_256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
1968 ; AVX10_256-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1969 ; AVX10_256-NEXT: vpsrlw $8, %ymm0, %ymm0
1970 ; AVX10_256-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1971 ; AVX10_256-NEXT: retq
1973 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
1975 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1976 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1977 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2
1978 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1979 ; XOPAVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1980 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm4, %xmm4
1981 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
1982 ; XOPAVX1-NEXT: vpshlb %xmm6, %xmm4, %xmm4
1983 ; XOPAVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
1984 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
1985 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1
1986 ; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm1
1987 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1988 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1989 ; XOPAVX1-NEXT: retq
1991 ; XOPAVX2-LABEL: constant_funnnel_v32i8:
1993 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1994 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1995 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
1996 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0
1997 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1998 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1999 ; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
2000 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2
2001 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250]
2002 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm2, %xmm2
2003 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
2004 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
2005 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2006 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2007 ; XOPAVX2-NEXT: retq
2008 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2013 ; Uniform Constant Shifts
; Funnel-shift-left of <4 x i64> operands where every lane uses the same
; constant amount, 14. In the expected output below, most configurations pair
; a left shift of %x by 14 (vpsllq $14) with a right shift of %y by 50
; (vpsrlq $50) and OR the results; the VBMI2-enabled configurations instead
; expect a single vpshldq $14. The assertion lines are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file).
2016 define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
2017 ; AVX1-LABEL: splatconstant_funnnel_v4i64:
2019 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm2
2020 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2021 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm1
2022 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2023 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm2
2024 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2025 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0
2026 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2027 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2030 ; AVX2-LABEL: splatconstant_funnnel_v4i64:
2032 ; AVX2-NEXT: vpsrlq $50, %ymm1, %ymm1
2033 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0
2034 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2037 ; AVX512F-LABEL: splatconstant_funnnel_v4i64:
2039 ; AVX512F-NEXT: vpsrlq $50, %ymm1, %ymm1
2040 ; AVX512F-NEXT: vpsllq $14, %ymm0, %ymm0
2041 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2042 ; AVX512F-NEXT: retq
2044 ; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
2045 ; AVX512VL: # %bb.0:
2046 ; AVX512VL-NEXT: vpsrlq $50, %ymm1, %ymm1
2047 ; AVX512VL-NEXT: vpsllq $14, %ymm0, %ymm0
2048 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
2049 ; AVX512VL-NEXT: retq
2051 ; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
2052 ; AVX512BW: # %bb.0:
2053 ; AVX512BW-NEXT: vpsrlq $50, %ymm1, %ymm1
2054 ; AVX512BW-NEXT: vpsllq $14, %ymm0, %ymm0
2055 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2056 ; AVX512BW-NEXT: retq
2058 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
2059 ; AVX512VBMI2: # %bb.0:
2060 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2061 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2062 ; AVX512VBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0
2063 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2064 ; AVX512VBMI2-NEXT: retq
2066 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
2067 ; AVX512VLBW: # %bb.0:
2068 ; AVX512VLBW-NEXT: vpsrlq $50, %ymm1, %ymm1
2069 ; AVX512VLBW-NEXT: vpsllq $14, %ymm0, %ymm0
2070 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2071 ; AVX512VLBW-NEXT: retq
2073 ; AVX10-LABEL: splatconstant_funnnel_v4i64:
2075 ; AVX10-NEXT: vpshldq $14, %ymm1, %ymm0, %ymm0
2078 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
2080 ; XOPAVX1-NEXT: vpsrlq $50, %xmm1, %xmm2
2081 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2082 ; XOPAVX1-NEXT: vpsrlq $50, %xmm1, %xmm1
2083 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2084 ; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm2
2085 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2086 ; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm0
2087 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2088 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2089 ; XOPAVX1-NEXT: retq
2091 ; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
2093 ; XOPAVX2-NEXT: vpsrlq $50, %ymm1, %ymm1
2094 ; XOPAVX2-NEXT: vpsllq $14, %ymm0, %ymm0
2095 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2096 ; XOPAVX2-NEXT: retq
2097 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
; Funnel-shift-left of <8 x i32> operands where every lane uses the same
; constant amount, 4. In the expected output below, most configurations pair
; a left shift of %x by 4 (vpslld $4) with a right shift of %y by 28
; (vpsrld $28) and OR the results; the VBMI2-enabled configurations instead
; expect a single vpshldd $4. The assertion lines are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file).
2101 define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
2102 ; AVX1-LABEL: splatconstant_funnnel_v8i32:
2104 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2105 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2106 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
2107 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2108 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm2
2109 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2110 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
2111 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2112 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2115 ; AVX2-LABEL: splatconstant_funnnel_v8i32:
2117 ; AVX2-NEXT: vpsrld $28, %ymm1, %ymm1
2118 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
2119 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2122 ; AVX512F-LABEL: splatconstant_funnnel_v8i32:
2124 ; AVX512F-NEXT: vpsrld $28, %ymm1, %ymm1
2125 ; AVX512F-NEXT: vpslld $4, %ymm0, %ymm0
2126 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2127 ; AVX512F-NEXT: retq
2129 ; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
2130 ; AVX512VL: # %bb.0:
2131 ; AVX512VL-NEXT: vpsrld $28, %ymm1, %ymm1
2132 ; AVX512VL-NEXT: vpslld $4, %ymm0, %ymm0
2133 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
2134 ; AVX512VL-NEXT: retq
2136 ; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
2137 ; AVX512BW: # %bb.0:
2138 ; AVX512BW-NEXT: vpsrld $28, %ymm1, %ymm1
2139 ; AVX512BW-NEXT: vpslld $4, %ymm0, %ymm0
2140 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2141 ; AVX512BW-NEXT: retq
2143 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
2144 ; AVX512VBMI2: # %bb.0:
2145 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2146 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2147 ; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
2148 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2149 ; AVX512VBMI2-NEXT: retq
2151 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
2152 ; AVX512VLBW: # %bb.0:
2153 ; AVX512VLBW-NEXT: vpsrld $28, %ymm1, %ymm1
2154 ; AVX512VLBW-NEXT: vpslld $4, %ymm0, %ymm0
2155 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2156 ; AVX512VLBW-NEXT: retq
2158 ; AVX10-LABEL: splatconstant_funnnel_v8i32:
2160 ; AVX10-NEXT: vpshldd $4, %ymm1, %ymm0, %ymm0
2163 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
2165 ; XOPAVX1-NEXT: vpsrld $28, %xmm1, %xmm2
2166 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2167 ; XOPAVX1-NEXT: vpsrld $28, %xmm1, %xmm1
2168 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2169 ; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm2
2170 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2171 ; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm0
2172 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2173 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2174 ; XOPAVX1-NEXT: retq
2176 ; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
2178 ; XOPAVX2-NEXT: vpsrld $28, %ymm1, %ymm1
2179 ; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm0
2180 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2181 ; XOPAVX2-NEXT: retq
2182 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
; Funnel-shift-left of <16 x i16> operands where every lane uses the same
; constant amount, 7. In the expected output below, most configurations pair
; a left shift of %x by 7 (vpsllw $7) with a right shift of %y by 9
; (vpsrlw $9) and OR the results; the VBMI2-enabled configurations instead
; expect a single vpshldw $7. The assertion lines are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file).
2186 define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
2187 ; AVX1-LABEL: splatconstant_funnnel_v16i16:
2189 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
2190 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2191 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm1
2192 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2193 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm2
2194 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2195 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
2196 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2197 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2200 ; AVX2-LABEL: splatconstant_funnnel_v16i16:
2202 ; AVX2-NEXT: vpsrlw $9, %ymm1, %ymm1
2203 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
2204 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2207 ; AVX512F-LABEL: splatconstant_funnnel_v16i16:
2209 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1
2210 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
2211 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
2212 ; AVX512F-NEXT: retq
2214 ; AVX512VL-LABEL: splatconstant_funnnel_v16i16:
2215 ; AVX512VL: # %bb.0:
2216 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1
2217 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
2218 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
2219 ; AVX512VL-NEXT: retq
2221 ; AVX512BW-LABEL: splatconstant_funnnel_v16i16:
2222 ; AVX512BW: # %bb.0:
2223 ; AVX512BW-NEXT: vpsrlw $9, %ymm1, %ymm1
2224 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
2225 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2226 ; AVX512BW-NEXT: retq
2228 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16:
2229 ; AVX512VBMI2: # %bb.0:
2230 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
2231 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2232 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
2233 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2234 ; AVX512VBMI2-NEXT: retq
2236 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16:
2237 ; AVX512VLBW: # %bb.0:
2238 ; AVX512VLBW-NEXT: vpsrlw $9, %ymm1, %ymm1
2239 ; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
2240 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2241 ; AVX512VLBW-NEXT: retq
2243 ; AVX10-LABEL: splatconstant_funnnel_v16i16:
2245 ; AVX10-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm0
2248 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
2250 ; XOPAVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
2251 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2252 ; XOPAVX1-NEXT: vpsrlw $9, %xmm1, %xmm1
2253 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2254 ; XOPAVX1-NEXT: vpsllw $7, %xmm0, %xmm2
2255 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2256 ; XOPAVX1-NEXT: vpsllw $7, %xmm0, %xmm0
2257 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2258 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2259 ; XOPAVX1-NEXT: retq
2261 ; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
2263 ; XOPAVX2-NEXT: vpsrlw $9, %ymm1, %ymm1
2264 ; XOPAVX2-NEXT: vpsllw $7, %ymm0, %ymm0
2265 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2266 ; XOPAVX2-NEXT: retq
2267 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
; Funnel-shift-left of <32 x i8> operands where every lane uses the same
; constant amount, 4. Every expected sequence below uses word-sized shifts
; (vpsllw $4 on %x, vpsrlw $4 on %y) and then merges the two results with a
; byte mask or bitwise select. The AVX1/AVX2 checks use vpand (with 15 / 240
; splats visible in the AVX1 checks) followed by an OR, the AVX512 and AVX10
; checks use vpternlogd with a memory operand, and the XOP checks use vpcmov
; with a constant-pool operand. The assertion lines are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file).
2271 define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
2272 ; AVX1-LABEL: splatconstant_funnnel_v32i8:
2274 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2275 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2276 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2277 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2278 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2279 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
2280 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2281 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2282 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
2283 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
2284 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2285 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
2286 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2287 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2288 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
2291 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
2293 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2294 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2295 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
2296 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2297 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
2300 ; AVX512F-LABEL: splatconstant_funnnel_v32i8:
2302 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
2303 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0
2304 ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
2305 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2306 ; AVX512F-NEXT: retq
2308 ; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
2309 ; AVX512VL: # %bb.0:
2310 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
2311 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0
2312 ; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
2313 ; AVX512VL-NEXT: retq
2315 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
2316 ; AVX512BW: # %bb.0:
2317 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm2
2318 ; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm0
2319 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
2320 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2321 ; AVX512BW-NEXT: retq
2323 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
2324 ; AVX512VBMI2: # %bb.0:
2325 ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm2
2326 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0
2327 ; AVX512VBMI2-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
2328 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2329 ; AVX512VBMI2-NEXT: retq
2331 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
2332 ; AVX512VLBW: # %bb.0:
2333 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2
2334 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0
2335 ; AVX512VLBW-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
2336 ; AVX512VLBW-NEXT: retq
2338 ; AVX10-LABEL: splatconstant_funnnel_v32i8:
2340 ; AVX10-NEXT: vpsllw $4, %ymm0, %ymm2
2341 ; AVX10-NEXT: vpsrlw $4, %ymm1, %ymm0
2342 ; AVX10-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
2345 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
2347 ; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
2348 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2349 ; XOPAVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2350 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2351 ; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm2
2352 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2353 ; XOPAVX1-NEXT: vpsllw $4, %xmm0, %xmm0
2354 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2355 ; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0
2356 ; XOPAVX1-NEXT: retq
2358 ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
2360 ; XOPAVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2361 ; XOPAVX2-NEXT: vpsllw $4, %ymm0, %ymm0
2362 ; XOPAVX2-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0
2363 ; XOPAVX2-NEXT: retq
2364 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)