1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512VLBW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefix=AVX512VBMI2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefix=AVX512VLVBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; Funnel-shift-left intrinsics, one per 256-bit vector type (4 x i64, 8 x i32,
; 16 x i16 and 32 x i8) called by the test functions in this file.
13 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
14 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
15 declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
16 declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; Variable rotate-left of a 4 x i64 vector: llvm.fshl is called with %x as both
; data operands, so every lane of %x is rotated left by the matching lane of
; %amt. The assertions that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
22 define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
23 ; AVX1-LABEL: var_funnnel_v4i64:
25 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
26 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
27 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
28 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm5
29 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
30 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm4
31 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
32 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5
33 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
34 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
35 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
36 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
37 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
38 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
39 ; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
40 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
41 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
42 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7
43 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
44 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
45 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
46 ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
47 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
48 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm4
49 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
50 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
51 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
52 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
53 ; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
56 ; AVX2-LABEL: var_funnnel_v4i64:
58 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
59 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
60 ; AVX2-NEXT: vpsllvq %ymm3, %ymm0, %ymm3
61 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
62 ; AVX2-NEXT: vpsubq %ymm1, %ymm4, %ymm1
63 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
64 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
65 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
68 ; AVX512F-LABEL: var_funnnel_v4i64:
70 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
71 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
72 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
73 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
76 ; AVX512VL-LABEL: var_funnnel_v4i64:
78 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
81 ; AVX512BW-LABEL: var_funnnel_v4i64:
83 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
84 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
85 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
86 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
89 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
90 ; AVX512VLBW: # %bb.0:
91 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
92 ; AVX512VLBW-NEXT: retq
94 ; AVX512VBMI2-LABEL: var_funnnel_v4i64:
95 ; AVX512VBMI2: # %bb.0:
96 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
97 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
98 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
99 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
100 ; AVX512VBMI2-NEXT: retq
102 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
103 ; AVX512VLVBMI2: # %bb.0:
104 ; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
105 ; AVX512VLVBMI2-NEXT: retq
107 ; XOPAVX1-LABEL: var_funnnel_v4i64:
109 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
110 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
111 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
112 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
113 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
116 ; XOPAVX2-LABEL: var_funnnel_v4i64:
118 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
119 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
120 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
121 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
122 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
124 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
; Variable rotate-left of an 8 x i32 vector: llvm.fshl is called with %x as both
; data operands, so every lane of %x is rotated left by the matching lane of
; %amt. The assertions that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
128 define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
129 ; AVX1-LABEL: var_funnnel_v8i32:
131 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
132 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
133 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
134 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
135 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
136 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
137 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
138 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
139 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
140 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
141 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
142 ; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
143 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
144 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
145 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
146 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
147 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
148 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
149 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
150 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
151 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
152 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
153 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
154 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
155 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
156 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
157 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
158 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
159 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
160 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
161 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
164 ; AVX2-LABEL: var_funnnel_v8i32:
166 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
167 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
168 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
169 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
170 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
171 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
172 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
175 ; AVX512F-LABEL: var_funnnel_v8i32:
177 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
178 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
179 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
180 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
183 ; AVX512VL-LABEL: var_funnnel_v8i32:
185 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
186 ; AVX512VL-NEXT: retq
188 ; AVX512BW-LABEL: var_funnnel_v8i32:
190 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
191 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
192 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
193 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
194 ; AVX512BW-NEXT: retq
196 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
197 ; AVX512VLBW: # %bb.0:
198 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
199 ; AVX512VLBW-NEXT: retq
201 ; AVX512VBMI2-LABEL: var_funnnel_v8i32:
202 ; AVX512VBMI2: # %bb.0:
203 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
204 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
205 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
206 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
207 ; AVX512VBMI2-NEXT: retq
209 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
210 ; AVX512VLVBMI2: # %bb.0:
211 ; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
212 ; AVX512VLVBMI2-NEXT: retq
214 ; XOPAVX1-LABEL: var_funnnel_v8i32:
216 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
217 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
218 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
219 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
220 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
223 ; XOPAVX2-LABEL: var_funnnel_v8i32:
225 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
226 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
227 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
228 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
229 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
231 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
; Variable rotate-left of a 16 x i16 vector: llvm.fshl is called with %x as both
; data operands, so every lane of %x is rotated left by the matching lane of
; %amt. The assertions that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
235 define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
236 ; AVX1-LABEL: var_funnnel_v16i16:
238 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
239 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
240 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
241 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
242 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
243 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
244 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
245 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
246 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
247 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
248 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
249 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
250 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
251 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
252 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
253 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
254 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
255 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
256 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
257 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
258 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
259 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
260 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
261 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
262 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
263 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
264 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
265 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
266 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
267 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
268 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
271 ; AVX2-LABEL: var_funnnel_v16i16:
273 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
274 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
275 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
276 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
277 ; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
278 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
279 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
280 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
281 ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
282 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
283 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
284 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
285 ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
286 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
287 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
288 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
289 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
290 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
291 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
292 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
293 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
296 ; AVX512F-LABEL: var_funnnel_v16i16:
298 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
299 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
300 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
301 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
302 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
303 ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
304 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
305 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
306 ; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
307 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
310 ; AVX512VL-LABEL: var_funnnel_v16i16:
312 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
313 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
314 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
315 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
316 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
317 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
318 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
319 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
320 ; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0
321 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
322 ; AVX512VL-NEXT: retq
324 ; AVX512BW-LABEL: var_funnnel_v16i16:
326 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
327 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
328 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
329 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
330 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
331 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
332 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
333 ; AVX512BW-NEXT: retq
335 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
336 ; AVX512VLBW: # %bb.0:
337 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
338 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
339 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
340 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
341 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
342 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
343 ; AVX512VLBW-NEXT: retq
345 ; AVX512VBMI2-LABEL: var_funnnel_v16i16:
346 ; AVX512VBMI2: # %bb.0:
347 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
348 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
349 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
350 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
351 ; AVX512VBMI2-NEXT: retq
353 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
354 ; AVX512VLVBMI2: # %bb.0:
355 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
356 ; AVX512VLVBMI2-NEXT: retq
358 ; XOPAVX1-LABEL: var_funnnel_v16i16:
360 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
361 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
362 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
363 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
364 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
367 ; XOPAVX2-LABEL: var_funnnel_v16i16:
369 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
370 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
371 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
372 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
373 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
375 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
; Variable rotate-left of a 32 x i8 vector: llvm.fshl is called with %x as both
; data operands, so every lane of %x is rotated left by the matching lane of
; %amt. The assertions that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
379 define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
380 ; AVX1-LABEL: var_funnnel_v32i8:
382 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
383 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
384 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
385 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
386 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
387 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
388 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
389 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
390 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
391 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
392 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
393 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
394 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
395 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
396 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
397 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
398 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
399 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
400 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
401 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
402 ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
403 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
404 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
405 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
406 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
407 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
408 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
409 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
410 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
411 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
412 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
413 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
414 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
415 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
416 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
417 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
418 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
419 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
420 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
421 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
422 ; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
423 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
424 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
425 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
426 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
427 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
430 ; AVX2-LABEL: var_funnnel_v32i8:
432 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
433 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
434 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
435 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
436 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
437 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
438 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
439 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
440 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
441 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
442 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
443 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
444 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
445 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
446 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
447 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
448 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
449 ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
450 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
451 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
454 ; AVX512F-LABEL: var_funnnel_v32i8:
456 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
457 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
458 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
459 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
460 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
461 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
462 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
463 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2
464 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
465 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
466 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
467 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
468 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
469 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
470 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
471 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
472 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
473 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
474 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
475 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
478 ; AVX512VL-LABEL: var_funnnel_v32i8:
480 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
481 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
482 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
483 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
484 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
485 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
486 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
487 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
488 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
489 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
490 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
491 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
492 ; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
493 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
494 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
495 ; AVX512VL-NEXT: retq
497 ; AVX512BW-LABEL: var_funnnel_v32i8:
499 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
500 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm3
501 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
502 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
503 ; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
504 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
505 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
506 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
507 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
508 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
509 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
510 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
511 ; AVX512BW-NEXT: retq
513 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
514 ; AVX512VLBW: # %bb.0:
515 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
516 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm3
517 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
518 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
519 ; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
520 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
521 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
522 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
523 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
524 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
525 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
526 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
527 ; AVX512VLBW-NEXT: retq
529 ; AVX512VBMI2-LABEL: var_funnnel_v32i8:
530 ; AVX512VBMI2: # %bb.0:
531 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
532 ; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3
533 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
534 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
535 ; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
536 ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
537 ; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1
538 ; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
539 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
540 ; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
541 ; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
542 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
543 ; AVX512VBMI2-NEXT: retq
545 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
546 ; AVX512VLVBMI2: # %bb.0:
547 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
548 ; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3
549 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
550 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
551 ; AVX512VLVBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
552 ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
553 ; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1
554 ; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
555 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
556 ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
557 ; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
558 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
559 ; AVX512VLVBMI2-NEXT: retq
561 ; XOPAVX1-LABEL: var_funnnel_v32i8:
563 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
564 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
565 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
566 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
567 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
570 ; XOPAVX2-LABEL: var_funnnel_v32i8:
572 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
573 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
574 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
575 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
576 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
578 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
583 ; Uniform Variable Shifts
; Rotate-left of a <4 x i64> vector by a uniform amount: element 0 of %amt is
; broadcast to every lane and passed to llvm.fshl with %x as both data operands.
; The assertion lines inside this function are autogenerated (see the NOTE at
; the top of the file); regenerate them with the script instead of hand-editing.
586 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
587 ; AVX1-LABEL: splatvar_funnnel_v4i64:
589 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
590 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
591 ; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
592 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
593 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
594 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
595 ; AVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm5
596 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
597 ; AVX1-NEXT: vpsrlq %xmm6, %xmm4, %xmm7
598 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
599 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm2
600 ; AVX1-NEXT: vpsrlq %xmm6, %xmm0, %xmm6
601 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
602 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
603 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
604 ; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm3
605 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
606 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
607 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
610 ; AVX2-LABEL: splatvar_funnnel_v4i64:
612 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
613 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
614 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3
615 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
616 ; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
617 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
618 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
619 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
622 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
624 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
625 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
626 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
627 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
630 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
632 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
633 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
634 ; AVX512VL-NEXT: retq
636 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
638 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
639 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
640 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
641 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
642 ; AVX512BW-NEXT: retq
644 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
645 ; AVX512VLBW: # %bb.0:
646 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
647 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
648 ; AVX512VLBW-NEXT: retq
650 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
651 ; AVX512VBMI2: # %bb.0:
652 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
653 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
654 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
655 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
656 ; AVX512VBMI2-NEXT: retq
658 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
659 ; AVX512VLVBMI2: # %bb.0:
660 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
661 ; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
662 ; AVX512VLVBMI2-NEXT: retq
664 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
666 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
667 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
668 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
669 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
670 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
673 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
675 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
676 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
677 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
678 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
679 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; An all-zero shuffle mask selects element 0 of %amt for every result lane.
681 %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
; Both data operands are %x, so this funnel shift left acts as a rotate left.
682 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
; Rotate-left of an <8 x i32> vector by a uniform amount: element 0 of %amt is
; broadcast to every lane and passed to llvm.fshl with %x as both data operands.
; The assertion lines inside this function are autogenerated (see the NOTE at
; the top of the file); regenerate them with the script instead of hand-editing.
686 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
687 ; AVX1-LABEL: splatvar_funnnel_v8i32:
689 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
690 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
691 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
692 ; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm4
693 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
694 ; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1
695 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
696 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
697 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
698 ; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
699 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
700 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
701 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
704 ; AVX2-LABEL: splatvar_funnnel_v8i32:
706 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
707 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
708 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
709 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2
710 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
711 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
712 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
713 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
714 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
717 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
719 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
720 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
721 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
722 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
725 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
727 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
728 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
729 ; AVX512VL-NEXT: retq
731 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
733 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
734 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
735 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
736 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
737 ; AVX512BW-NEXT: retq
739 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
740 ; AVX512VLBW: # %bb.0:
741 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
742 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
743 ; AVX512VLBW-NEXT: retq
745 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
746 ; AVX512VBMI2: # %bb.0:
747 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
748 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
749 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
750 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
751 ; AVX512VBMI2-NEXT: retq
753 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
754 ; AVX512VLVBMI2: # %bb.0:
755 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
756 ; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
757 ; AVX512VLVBMI2-NEXT: retq
759 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
761 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
762 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
763 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
764 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
765 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
768 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
770 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
771 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
772 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
773 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
774 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; An all-zero shuffle mask selects element 0 of %amt for every result lane.
776 %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
; Both data operands are %x, so this funnel shift left acts as a rotate left.
777 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
; Rotate-left of a <16 x i16> vector by a uniform amount: element 0 of %amt is
; broadcast to every lane and passed to llvm.fshl with %x as both data operands.
; The assertion lines inside this function are autogenerated (see the NOTE at
; the top of the file); regenerate them with the script instead of hand-editing.
781 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
782 ; AVX1-LABEL: splatvar_funnnel_v16i16:
784 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
785 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
786 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
787 ; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
788 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
789 ; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1
790 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
791 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
792 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
793 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
794 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
795 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
796 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
799 ; AVX2-LABEL: splatvar_funnnel_v16i16:
801 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
802 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
803 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
804 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
805 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
806 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
807 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
808 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
811 ; AVX512F-LABEL: splatvar_funnnel_v16i16:
813 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
814 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
815 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2
816 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
817 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
818 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
819 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
820 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
823 ; AVX512VL-LABEL: splatvar_funnnel_v16i16:
825 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
826 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
827 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2
828 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
829 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
830 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
831 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
832 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
833 ; AVX512VL-NEXT: retq
835 ; AVX512BW-LABEL: splatvar_funnnel_v16i16:
837 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
838 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
839 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm2
840 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
841 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
842 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
843 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
844 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
845 ; AVX512BW-NEXT: retq
847 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
848 ; AVX512VLBW: # %bb.0:
849 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
850 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
851 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm2
852 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
853 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
854 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
855 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
856 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
857 ; AVX512VLBW-NEXT: retq
859 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
860 ; AVX512VBMI2: # %bb.0:
861 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
862 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
863 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
864 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
865 ; AVX512VBMI2-NEXT: retq
867 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
868 ; AVX512VLVBMI2: # %bb.0:
869 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
870 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
871 ; AVX512VLVBMI2-NEXT: retq
873 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
875 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
876 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
877 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
878 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
879 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
880 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
883 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
885 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
886 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
887 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
888 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
889 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; An all-zero shuffle mask selects element 0 of %amt for every result lane.
891 %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
; Both data operands are %x, so this funnel shift left acts as a rotate left.
892 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
; Rotate-left of a <32 x i8> vector by a uniform amount: element 0 of %amt is
; broadcast to every lane and passed to llvm.fshl with %x as both data operands.
; The assertion lines inside this function are autogenerated (see the NOTE at
; the top of the file); regenerate them with the script instead of hand-editing.
896 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
897 ; AVX1-LABEL: splatvar_funnnel_v32i8:
899 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
900 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
901 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
902 ; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
903 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
904 ; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm6
905 ; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
906 ; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
907 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
908 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
909 ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
910 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
911 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
912 ; AVX1-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
913 ; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
914 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
915 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
916 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
917 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
918 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
919 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
920 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
921 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
924 ; AVX2-LABEL: splatvar_funnnel_v32i8:
926 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
927 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
928 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
929 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
930 ; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
931 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
932 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
933 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
934 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
935 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
936 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
937 ; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
938 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
939 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
940 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
941 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
944 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
946 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
947 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
948 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
949 ; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
950 ; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
951 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
952 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
953 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
954 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
955 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
956 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
957 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
958 ; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
959 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
960 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
961 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
964 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
966 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
967 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
968 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
969 ; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
970 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
971 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
972 ; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
973 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
974 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
975 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
976 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
977 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
978 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
979 ; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
980 ; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
981 ; AVX512VL-NEXT: retq
983 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
985 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
986 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
987 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
988 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
989 ; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
990 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
991 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
992 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
993 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
994 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
995 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
996 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
997 ; AVX512BW-NEXT: retq
999 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
1000 ; AVX512VLBW: # %bb.0:
1001 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1002 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
1003 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1004 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1005 ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
1006 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
1007 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1008 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
1009 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1010 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
1011 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
1012 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1013 ; AVX512VLBW-NEXT: retq
1015 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
1016 ; AVX512VBMI2: # %bb.0:
1017 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1018 ; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
1019 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1020 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1021 ; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3
1022 ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1023 ; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1024 ; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
1025 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1026 ; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
1027 ; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
1028 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1029 ; AVX512VBMI2-NEXT: retq
1031 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
1032 ; AVX512VLVBMI2: # %bb.0:
1033 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1034 ; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
1035 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1036 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1037 ; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3
1038 ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1039 ; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1040 ; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
1041 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1042 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
1043 ; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
1044 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
1045 ; AVX512VLVBMI2-NEXT: retq
1047 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
1049 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1050 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1051 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1052 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
1053 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
1054 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1055 ; XOPAVX1-NEXT: retq
1057 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
1059 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1060 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1061 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
1062 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
1063 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1064 ; XOPAVX2-NEXT: retq
; An all-zero shuffle mask selects element 0 of %amt for every result lane.
1065 %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
; Both data operands are %x, so this funnel shift left acts as a rotate left.
1066 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
; Rotate-left of a <4 x i64> vector by the per-lane constant amounts
; <4, 14, 50, 60>, expressed as llvm.fshl with %x as both data operands.
; The assertion lines inside this function are autogenerated (see the NOTE at
; the top of the file); regenerate them with the script instead of hand-editing.
1074 define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
1075 ; AVX1-LABEL: constant_funnnel_v4i64:
1077 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1078 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm2
1079 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm3
1080 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1081 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
1082 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm4
1083 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1084 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1085 ; AVX1-NEXT: vpsllq $60, %xmm1, %xmm3
1086 ; AVX1-NEXT: vpsllq $50, %xmm1, %xmm1
1087 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1088 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
1089 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
1090 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
1091 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1092 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
1095 ; AVX2-LABEL: constant_funnnel_v4i64:
1097 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1098 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1099 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1102 ; AVX512F-LABEL: constant_funnnel_v4i64:
1104 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1105 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1106 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1107 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1108 ; AVX512F-NEXT: retq
1110 ; AVX512VL-LABEL: constant_funnnel_v4i64:
1111 ; AVX512VL: # %bb.0:
1112 ; AVX512VL-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1113 ; AVX512VL-NEXT: retq
1115 ; AVX512BW-LABEL: constant_funnnel_v4i64:
1116 ; AVX512BW: # %bb.0:
1117 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1118 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1119 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1120 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1121 ; AVX512BW-NEXT: retq
1123 ; AVX512VLBW-LABEL: constant_funnnel_v4i64:
1124 ; AVX512VLBW: # %bb.0:
1125 ; AVX512VLBW-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1126 ; AVX512VLBW-NEXT: retq
1128 ; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
1129 ; AVX512VBMI2: # %bb.0:
1130 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1131 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1132 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1133 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1134 ; AVX512VBMI2-NEXT: retq
1136 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
1137 ; AVX512VLVBMI2: # %bb.0:
1138 ; AVX512VLVBMI2-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1139 ; AVX512VLVBMI2-NEXT: retq
1141 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
1143 ; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1144 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1145 ; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1146 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1147 ; XOPAVX1-NEXT: retq
1149 ; XOPAVX2-LABEL: constant_funnnel_v4i64:
1151 ; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1152 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1153 ; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1154 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1155 ; XOPAVX2-NEXT: retq
; Both data operands are %x, so this funnel shift left acts as a rotate left;
; each lane uses its own constant amount.
1156 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
; Rotate-left of an <8 x i32> vector by the per-lane constant amounts
; <4, 5, 6, 7, 8, 9, 10, 11>, expressed as llvm.fshl with %x as both data
; operands.
; The assertion lines inside this function are autogenerated (see the NOTE at
; the top of the file); regenerate them with the script instead of hand-editing.
1160 define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
1161 ; AVX1-LABEL: constant_funnnel_v8i32:
1163 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
1164 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1165 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1166 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1167 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
1168 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
1169 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
1170 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
1171 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1172 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1173 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1174 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
1175 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1176 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1177 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
1178 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
1179 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1180 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1181 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
1182 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1183 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1184 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1187 ; AVX2-LABEL: constant_funnnel_v8i32:
1189 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1190 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1191 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1194 ; AVX512F-LABEL: constant_funnnel_v8i32:
1196 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1197 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1198 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1199 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1200 ; AVX512F-NEXT: retq
1202 ; AVX512VL-LABEL: constant_funnnel_v8i32:
1203 ; AVX512VL: # %bb.0:
1204 ; AVX512VL-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1205 ; AVX512VL-NEXT: retq
1207 ; AVX512BW-LABEL: constant_funnnel_v8i32:
1208 ; AVX512BW: # %bb.0:
1209 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1210 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1211 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1212 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1213 ; AVX512BW-NEXT: retq
1215 ; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1216 ; AVX512VLBW: # %bb.0:
1217 ; AVX512VLBW-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1218 ; AVX512VLBW-NEXT: retq
1220 ; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
1221 ; AVX512VBMI2: # %bb.0:
1222 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1223 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1224 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1225 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1226 ; AVX512VBMI2-NEXT: retq
1228 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
1229 ; AVX512VLVBMI2: # %bb.0:
1230 ; AVX512VLVBMI2-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1231 ; AVX512VLVBMI2-NEXT: retq
1233 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
1235 ; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1236 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1237 ; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1238 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1239 ; XOPAVX1-NEXT: retq
1241 ; XOPAVX2-LABEL: constant_funnnel_v8i32:
1243 ; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1244 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1245 ; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1246 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1247 ; XOPAVX2-NEXT: retq
; Both data operands are %x, so this funnel shift left acts as a rotate left;
; each lane uses its own constant amount.
1248 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
; Funnel-shift-left test where both data operands of llvm.fshl.v16i16 are %x
; and every i16 lane gets a different constant shift amount (0, 1, ..., 15).
; Per the LLVM LangRef, fshl with identical first two operands is a rotate
; left; the XOP expectations below use vprotw for it.
; The expected assembly is autogenerated (see the note on the first line of
; this file); regenerate it with the update script instead of hand-editing.
1252 define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
1253 ; AVX1-LABEL: constant_funnnel_v16i16:
1255 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1256 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1257 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1258 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1259 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1260 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1261 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1262 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1263 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1264 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1267 ; AVX2-LABEL: constant_funnnel_v16i16:
1269 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1270 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1271 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1272 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1275 ; AVX512F-LABEL: constant_funnnel_v16i16:
1277 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1278 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1279 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1280 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1281 ; AVX512F-NEXT: retq
1283 ; AVX512VL-LABEL: constant_funnnel_v16i16:
1284 ; AVX512VL: # %bb.0:
1285 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1286 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1287 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1288 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1289 ; AVX512VL-NEXT: retq
1291 ; AVX512BW-LABEL: constant_funnnel_v16i16:
1292 ; AVX512BW: # %bb.0:
1293 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1294 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1295 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
1296 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1297 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1298 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1299 ; AVX512BW-NEXT: retq
1301 ; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1302 ; AVX512VLBW: # %bb.0:
1303 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1304 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1305 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1306 ; AVX512VLBW-NEXT: retq
1308 ; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
1309 ; AVX512VBMI2: # %bb.0:
1310 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1311 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1312 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
1313 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1314 ; AVX512VBMI2-NEXT: retq
1316 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
1317 ; AVX512VLVBMI2: # %bb.0:
1318 ; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1319 ; AVX512VLVBMI2-NEXT: retq
1321 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
1323 ; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1324 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1325 ; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1326 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1327 ; XOPAVX1-NEXT: retq
1329 ; XOPAVX2-LABEL: constant_funnnel_v16i16:
1331 ; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1332 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1333 ; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1334 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1335 ; XOPAVX2-NEXT: retq
1336 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
; Funnel-shift-left test where both data operands of llvm.fshl.v32i8 are %x
; and the i8 lanes use the non-uniform constant shift amounts
; 0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1 (the same 16-entry pattern for both halves).
; Per the LLVM LangRef, fshl with identical first two operands is a rotate
; left; the XOP expectations below use vprotb for it.
; The expected assembly is autogenerated (see the note on the first line of
; this file); regenerate it with the update script instead of hand-editing.
1340 define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
1341 ; AVX1-LABEL: constant_funnnel_v32i8:
1343 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1344 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
1345 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1346 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
1347 ; AVX1-NEXT: vpmullw %xmm3, %xmm9, %xmm3
1348 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1349 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1350 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
1351 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7
1352 ; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
1353 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
1354 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1355 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
1356 ; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
1357 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1358 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1359 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1360 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
1361 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
1362 ; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
1363 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1364 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1365 ; AVX1-NEXT: vpmullw %xmm3, %xmm9, %xmm3
1366 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1367 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1368 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6
1369 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
1370 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
1371 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1372 ; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
1373 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1374 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
1375 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1376 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1377 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1378 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1381 ; AVX2-LABEL: constant_funnnel_v32i8:
1383 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
1384 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1385 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1386 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
1387 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1388 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
1389 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1390 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1391 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1392 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1393 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1394 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1395 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1396 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1397 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1398 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
1399 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1400 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1401 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1402 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1403 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1406 ; AVX512F-LABEL: constant_funnnel_v32i8:
1408 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1409 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1410 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1411 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1412 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1413 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
1414 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1415 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1416 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1417 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1418 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1419 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1420 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1421 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1422 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1423 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1424 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1425 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1426 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1427 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1428 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
1429 ; AVX512F-NEXT: retq
1431 ; AVX512VL-LABEL: constant_funnnel_v32i8:
1432 ; AVX512VL: # %bb.0:
1433 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1434 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1435 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1436 ; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
1437 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1438 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
1439 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1440 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1441 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1442 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1443 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1444 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1445 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
1446 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1447 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1448 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
1449 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1450 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1451 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1452 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1453 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
1454 ; AVX512VL-NEXT: retq
1456 ; AVX512BW-LABEL: constant_funnnel_v32i8:
1457 ; AVX512BW: # %bb.0:
1458 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1459 ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1460 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1461 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1462 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1463 ; AVX512BW-NEXT: retq
1465 ; AVX512VLBW-LABEL: constant_funnnel_v32i8:
1466 ; AVX512VLBW: # %bb.0:
1467 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1468 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1469 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1470 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1471 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1472 ; AVX512VLBW-NEXT: retq
1474 ; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
1475 ; AVX512VBMI2: # %bb.0:
1476 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1477 ; AVX512VBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1478 ; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1479 ; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1480 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1481 ; AVX512VBMI2-NEXT: retq
1483 ; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
1484 ; AVX512VLVBMI2: # %bb.0:
1485 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1486 ; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1487 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1488 ; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
1489 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
1490 ; AVX512VLVBMI2-NEXT: retq
1492 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
1494 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1495 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1496 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
1497 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
1498 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1499 ; XOPAVX1-NEXT: retq
1501 ; XOPAVX2-LABEL: constant_funnnel_v32i8:
1503 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1504 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1505 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
1506 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
1507 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1508 ; XOPAVX2-NEXT: retq
; Lanes 8 and 24 use a shift amount of 8, which equals the i8 element width;
; the XOP rotate-amount constant expected above holds 0 in that position.
1509 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1514 ; Uniform Constant Shifts
; Funnel-shift-left test where both data operands of llvm.fshl.v4i64 are %x
; and every i64 lane uses the same constant shift amount, 14.
; The plain AVX and AVX2 expectations combine a right shift by 50 (64 - 14)
; with a left shift by 14; the AVX-512 variants expect vprolq $14 and the XOP
; variants expect vprotq $14 on each 128-bit half.
; The expected assembly is autogenerated (see the note on the first line of
; this file); regenerate it with the update script instead of hand-editing.
1517 define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
1518 ; AVX1-LABEL: splatconstant_funnnel_v4i64:
1520 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1
1521 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1522 ; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm3
1523 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1524 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0
1525 ; AVX1-NEXT: vpsllq $14, %xmm2, %xmm2
1526 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1527 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1530 ; AVX2-LABEL: splatconstant_funnnel_v4i64:
1532 ; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm1
1533 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0
1534 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1537 ; AVX512F-LABEL: splatconstant_funnnel_v4i64:
1539 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1540 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
1541 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1542 ; AVX512F-NEXT: retq
1544 ; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
1545 ; AVX512VL: # %bb.0:
1546 ; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
1547 ; AVX512VL-NEXT: retq
1549 ; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
1550 ; AVX512BW: # %bb.0:
1551 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1552 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
1553 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1554 ; AVX512BW-NEXT: retq
1556 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
1557 ; AVX512VLBW: # %bb.0:
1558 ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
1559 ; AVX512VLBW-NEXT: retq
1561 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
1562 ; AVX512VBMI2: # %bb.0:
1563 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1564 ; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
1565 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1566 ; AVX512VBMI2-NEXT: retq
1568 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
1569 ; AVX512VLVBMI2: # %bb.0:
1570 ; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0
1571 ; AVX512VLVBMI2-NEXT: retq
1573 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
1575 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
1576 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1577 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
1578 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1579 ; XOPAVX1-NEXT: retq
1581 ; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
1583 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
1584 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1585 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
1586 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1587 ; XOPAVX2-NEXT: retq
1588 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
; Funnel-shift-left test where both data operands of llvm.fshl.v8i32 are %x
; and every i32 lane uses the same constant shift amount, 4.
; The plain AVX and AVX2 expectations combine a right shift by 28 (32 - 4)
; with a left shift by 4; the AVX-512 variants expect vprold $4 and the XOP
; variants expect vprotd $4 on each 128-bit half.
; The expected assembly is autogenerated (see the note on the first line of
; this file); regenerate it with the update script instead of hand-editing.
1592 define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
1593 ; AVX1-LABEL: splatconstant_funnnel_v8i32:
1595 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1596 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1597 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1598 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1599 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1600 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1601 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1602 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1605 ; AVX2-LABEL: splatconstant_funnnel_v8i32:
1607 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1608 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1609 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1612 ; AVX512F-LABEL: splatconstant_funnnel_v8i32:
1614 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1615 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1616 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1617 ; AVX512F-NEXT: retq
1619 ; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
1620 ; AVX512VL: # %bb.0:
1621 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1622 ; AVX512VL-NEXT: retq
1624 ; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
1625 ; AVX512BW: # %bb.0:
1626 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1627 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1628 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1629 ; AVX512BW-NEXT: retq
1631 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
1632 ; AVX512VLBW: # %bb.0:
1633 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1634 ; AVX512VLBW-NEXT: retq
1636 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
1637 ; AVX512VBMI2: # %bb.0:
1638 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1639 ; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
1640 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1641 ; AVX512VBMI2-NEXT: retq
1643 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
1644 ; AVX512VLVBMI2: # %bb.0:
1645 ; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
1646 ; AVX512VLVBMI2-NEXT: retq
1648 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
1650 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1651 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1652 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1653 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1654 ; XOPAVX1-NEXT: retq
1656 ; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
1658 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1659 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1660 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1661 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1662 ; XOPAVX2-NEXT: retq
1663 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
; Funnel-shift-left test where both data operands of llvm.fshl.v16i16 are %x
; and every i16 lane uses the same constant shift amount, 7.
; Most expectations combine a right shift by 9 (16 - 7) with a left shift by
; 7; the VBMI2 variants expect vpshldw $7 with the same register as both
; sources, and the XOP variants expect vprotw $7 on each 128-bit half.
; The expected assembly is autogenerated (see the note on the first line of
; this file); regenerate it with the update script instead of hand-editing.
1667 define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
1668 ; AVX1-LABEL: splatconstant_funnnel_v16i16:
1670 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1671 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
1672 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
1673 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1674 ; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
1675 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
1676 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1677 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1680 ; AVX2-LABEL: splatconstant_funnnel_v16i16:
1682 ; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
1683 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
1684 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1687 ; AVX512F-LABEL: splatconstant_funnnel_v16i16:
1689 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
1690 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
1691 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1692 ; AVX512F-NEXT: retq
1694 ; AVX512VL-LABEL: splatconstant_funnnel_v16i16:
1695 ; AVX512VL: # %bb.0:
1696 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
1697 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
1698 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1699 ; AVX512VL-NEXT: retq
1701 ; AVX512BW-LABEL: splatconstant_funnnel_v16i16:
1702 ; AVX512BW: # %bb.0:
1703 ; AVX512BW-NEXT: vpsrlw $9, %ymm0, %ymm1
1704 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
1705 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1706 ; AVX512BW-NEXT: retq
1708 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16:
1709 ; AVX512VLBW: # %bb.0:
1710 ; AVX512VLBW-NEXT: vpsrlw $9, %ymm0, %ymm1
1711 ; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
1712 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1713 ; AVX512VLBW-NEXT: retq
1715 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16:
1716 ; AVX512VBMI2: # %bb.0:
1717 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1718 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
1719 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1720 ; AVX512VBMI2-NEXT: retq
1722 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
1723 ; AVX512VLVBMI2: # %bb.0:
1724 ; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm0, %ymm0, %ymm0
1725 ; AVX512VLVBMI2-NEXT: retq
1727 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
1729 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
1730 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1731 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
1732 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1733 ; XOPAVX1-NEXT: retq
1735 ; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
1737 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
1738 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1739 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
1740 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1741 ; XOPAVX2-NEXT: retq
1742 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
; Funnel-shift-left test where both data operands of llvm.fshl.v32i8 are %x
; and every i8 lane uses the same constant shift amount, 4.
; The non-XOP expectations shift 16-bit words by 4 in both directions and
; merge the two results through a constant mask (vpand/vpandn plus vpor, or a
; single vpternlogq when VL is enabled); the XOP variants expect vprotb $4 on
; each 128-bit half.
; The expected assembly is autogenerated (see the note on the first line of
; this file); regenerate it with the update script instead of hand-editing.
1746 define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
1747 ; AVX1-LABEL: splatconstant_funnnel_v32i8:
1749 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1750 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1751 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1752 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1753 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1754 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1755 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1756 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1757 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1758 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1759 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1760 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1761 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1764 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
1766 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1767 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1768 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1769 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1770 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1773 ; AVX512F-LABEL: splatconstant_funnnel_v32i8:
1775 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
1776 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1777 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1778 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1779 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1780 ; AVX512F-NEXT: retq
1782 ; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
1783 ; AVX512VL: # %bb.0:
1784 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1785 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
1786 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1787 ; AVX512VL-NEXT: retq
1789 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
1790 ; AVX512BW: # %bb.0:
1791 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm1
1792 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1793 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm0
1794 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1795 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1796 ; AVX512BW-NEXT: retq
1798 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
1799 ; AVX512VLBW: # %bb.0:
1800 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1801 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1802 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1803 ; AVX512VLBW-NEXT: retq
1805 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
1806 ; AVX512VBMI2: # %bb.0:
1807 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm1
1808 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1809 ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm0
1810 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1811 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
1812 ; AVX512VBMI2-NEXT: retq
1814 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
1815 ; AVX512VLVBMI2: # %bb.0:
1816 ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
1817 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
1818 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1819 ; AVX512VLVBMI2-NEXT: retq
1821 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
1823 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1824 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1825 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1826 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1827 ; XOPAVX1-NEXT: retq
1829 ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
1831 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1832 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1833 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1834 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1835 ; XOPAVX2-NEXT: retq
1836 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)