1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512VLBW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefix=AVX512VBMI2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefix=AVX512VLVBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; Funnel-shift-left intrinsic declarations, one for each 256-bit integer
; vector type (4 x i64, 8 x i32, 16 x i16, 32 x i8) called by the tests below.
13 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
14 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
15 declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
16 declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; Variable rotate-left of <4 x i64>, written as fshl with %x supplied for both
; data operands and a per-element amount %amt (see the call at the end).
; Per the checks below, the AVX-512 configurations select vprolvq (widened to
; zmm when avx512vl is absent), XOP selects vprotq on each 128-bit half, and
; plain AVX / AVX2 expand to a shift-left, shift-right, or sequence.
22 define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
23 ; AVX1-LABEL: var_funnnel_v4i64:
25 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
26 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
27 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
28 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm5
29 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
30 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm4
31 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
32 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5
33 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
34 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
35 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
36 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
37 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
38 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
39 ; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
40 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [63,63]
41 ; AVX1-NEXT: # xmm6 = mem[0,0]
42 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
43 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7
44 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
45 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
46 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
47 ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
48 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
49 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm4
50 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
51 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
52 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
53 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
54 ; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
57 ; AVX2-LABEL: var_funnnel_v4i64:
59 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
60 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
61 ; AVX2-NEXT: vpsllvq %ymm3, %ymm0, %ymm3
62 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
63 ; AVX2-NEXT: vpsubq %ymm1, %ymm4, %ymm1
64 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
65 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
66 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
69 ; AVX512F-LABEL: var_funnnel_v4i64:
71 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
72 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
73 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
74 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
77 ; AVX512VL-LABEL: var_funnnel_v4i64:
79 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
82 ; AVX512BW-LABEL: var_funnnel_v4i64:
84 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
85 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
86 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
87 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
90 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
91 ; AVX512VLBW: # %bb.0:
92 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
93 ; AVX512VLBW-NEXT: retq
95 ; AVX512VBMI2-LABEL: var_funnnel_v4i64:
96 ; AVX512VBMI2: # %bb.0:
97 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
98 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
99 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
100 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
101 ; AVX512VBMI2-NEXT: retq
103 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
104 ; AVX512VLVBMI2: # %bb.0:
105 ; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
106 ; AVX512VLVBMI2-NEXT: retq
108 ; XOPAVX1-LABEL: var_funnnel_v4i64:
110 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
111 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
112 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
113 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
114 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
117 ; XOPAVX2-LABEL: var_funnnel_v4i64:
119 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
120 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
121 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
122 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
123 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
125 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
; Variable rotate-left of <8 x i32>, written as fshl with %x supplied for both
; data operands and a per-element amount %amt (see the call at the end).
; Per the checks below, the AVX-512 configurations select vprolvd (widened to
; zmm when avx512vl is absent), XOP selects vprotd on each 128-bit half, AVX2
; uses vpsllvd / vpsrlvd / vpor, and plain AVX uses a vpmuludq-based expansion.
129 define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
130 ; AVX1-LABEL: var_funnnel_v8i32:
132 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
133 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
134 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
135 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
136 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
137 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
138 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
139 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
140 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
141 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
142 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
143 ; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
144 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
145 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
146 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
147 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
148 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
149 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
150 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
151 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
152 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
153 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
154 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
155 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
156 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
157 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
158 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
159 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
160 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
161 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
162 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
165 ; AVX2-LABEL: var_funnnel_v8i32:
167 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
168 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
169 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
170 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
171 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
172 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
173 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
176 ; AVX512F-LABEL: var_funnnel_v8i32:
178 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
179 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
180 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
181 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
184 ; AVX512VL-LABEL: var_funnnel_v8i32:
186 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
187 ; AVX512VL-NEXT: retq
189 ; AVX512BW-LABEL: var_funnnel_v8i32:
191 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
192 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
193 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
194 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
195 ; AVX512BW-NEXT: retq
197 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
198 ; AVX512VLBW: # %bb.0:
199 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
200 ; AVX512VLBW-NEXT: retq
202 ; AVX512VBMI2-LABEL: var_funnnel_v8i32:
203 ; AVX512VBMI2: # %bb.0:
204 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
205 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
206 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
207 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
208 ; AVX512VBMI2-NEXT: retq
210 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
211 ; AVX512VLVBMI2: # %bb.0:
212 ; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
213 ; AVX512VLVBMI2-NEXT: retq
215 ; XOPAVX1-LABEL: var_funnnel_v8i32:
217 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
218 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
219 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
220 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
221 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
224 ; XOPAVX2-LABEL: var_funnnel_v8i32:
226 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
227 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
228 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
229 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
230 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
232 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
; Variable rotate-left of <16 x i16>, written as fshl with %x supplied for both
; data operands and a per-element amount %amt (see the call at the end).
; Per the checks below, the VBMI2 configurations select vpshldvw, XOP selects
; vprotw on each 128-bit half, the BW configurations use vpsllvw / vpsrlvw /
; vpor, AVX2 and the non-BW AVX-512 configurations unpack to dwords and use
; vpsllvd + vpsrld $16 + vpackusdw, and plain AVX uses vpmullw / vpmulhuw / vpor.
236 define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
237 ; AVX1-LABEL: var_funnnel_v16i16:
239 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
240 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
241 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
242 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
243 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
244 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
245 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
246 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
247 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
248 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
249 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
250 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
251 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
252 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
253 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
254 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
255 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
256 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
257 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
258 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
259 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
260 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
261 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
262 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
263 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
264 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
265 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
266 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
267 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
268 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
269 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
272 ; AVX2-LABEL: var_funnnel_v16i16:
274 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
275 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
276 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
277 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
278 ; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
279 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
280 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
281 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
282 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
283 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
284 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
287 ; AVX512F-LABEL: var_funnnel_v16i16:
289 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
290 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
291 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
292 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
293 ; AVX512F-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
294 ; AVX512F-NEXT: vpsrld $16, %ymm3, %ymm3
295 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
296 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
297 ; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
298 ; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
299 ; AVX512F-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
302 ; AVX512VL-LABEL: var_funnnel_v16i16:
304 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
305 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
306 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
307 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
308 ; AVX512VL-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
309 ; AVX512VL-NEXT: vpsrld $16, %ymm3, %ymm3
310 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
311 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
312 ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
313 ; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
314 ; AVX512VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
315 ; AVX512VL-NEXT: retq
317 ; AVX512BW-LABEL: var_funnnel_v16i16:
319 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
320 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
321 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
322 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
323 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
324 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
325 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
326 ; AVX512BW-NEXT: retq
328 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
329 ; AVX512VLBW: # %bb.0:
330 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
331 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
332 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
333 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
334 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
335 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
336 ; AVX512VLBW-NEXT: retq
338 ; AVX512VBMI2-LABEL: var_funnnel_v16i16:
339 ; AVX512VBMI2: # %bb.0:
340 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
341 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
342 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
343 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
344 ; AVX512VBMI2-NEXT: retq
346 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
347 ; AVX512VLVBMI2: # %bb.0:
348 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
349 ; AVX512VLVBMI2-NEXT: retq
351 ; XOPAVX1-LABEL: var_funnnel_v16i16:
353 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
354 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
355 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
356 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
357 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
360 ; XOPAVX2-LABEL: var_funnnel_v16i16:
362 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
363 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
364 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
365 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
366 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
368 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
; Variable rotate-left of <32 x i8>, written as fshl with %x supplied for both
; data operands and a per-element amount %amt (see the call at the end).
; Per the checks below, XOP selects vprotb on each 128-bit half, the BW and
; VBMI2 configurations unpack bytes to words and use vpsllvw + vpsrlw $8 +
; vpackuswb, and the remaining configurations use three vpblendvb-selected
; stages built from immediate shifts by 4, 2 and 1 bit(s).
372 define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
373 ; AVX1-LABEL: var_funnnel_v32i8:
375 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
376 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
377 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
378 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
379 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
380 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
381 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
382 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
383 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
384 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
385 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
386 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
387 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
388 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
389 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
390 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
391 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
392 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
393 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
394 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
395 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
396 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8
397 ; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
398 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
399 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
400 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
401 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
402 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
403 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
404 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
405 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
406 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
407 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
408 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
409 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
410 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
411 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
412 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
413 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
414 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
415 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
416 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
417 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
418 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
419 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
420 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
423 ; AVX2-LABEL: var_funnnel_v32i8:
425 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
426 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
427 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
428 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
429 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
430 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
431 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
432 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
433 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
434 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
435 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
436 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
437 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
438 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
439 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
440 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
441 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
442 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
443 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
444 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
447 ; AVX512F-LABEL: var_funnnel_v32i8:
449 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
450 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
451 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
452 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
453 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
454 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
455 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
456 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3
457 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
458 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
459 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
460 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
461 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
462 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
463 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
464 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
467 ; AVX512VL-LABEL: var_funnnel_v32i8:
469 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
470 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
471 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
472 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
473 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
474 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
475 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
476 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3
477 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
478 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
479 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
480 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
481 ; AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
482 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
483 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
484 ; AVX512VL-NEXT: retq
486 ; AVX512BW-LABEL: var_funnnel_v32i8:
488 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
489 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
490 ; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
491 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
492 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm2, %zmm2
493 ; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
494 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
495 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
496 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
497 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
498 ; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
499 ; AVX512BW-NEXT: retq
501 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
502 ; AVX512VLBW: # %bb.0:
503 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
504 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
505 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
506 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
507 ; AVX512VLBW-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
508 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm3, %ymm3
509 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
510 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
511 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
512 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
513 ; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
514 ; AVX512VLBW-NEXT: retq
516 ; AVX512VBMI2-LABEL: var_funnnel_v32i8:
517 ; AVX512VBMI2: # %bb.0:
518 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
519 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
520 ; AVX512VBMI2-NEXT: vpxor %xmm3, %xmm3, %xmm3
521 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
522 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm2, %zmm2
523 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2
524 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
525 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
526 ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
527 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
528 ; AVX512VBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
529 ; AVX512VBMI2-NEXT: retq
531 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
532 ; AVX512VLVBMI2: # %bb.0:
533 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
534 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
535 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
536 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
537 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
538 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm3, %ymm3
539 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
540 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
541 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
542 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
543 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
544 ; AVX512VLVBMI2-NEXT: retq
546 ; XOPAVX1-LABEL: var_funnnel_v32i8:
548 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
549 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
550 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
551 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
552 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
555 ; XOPAVX2-LABEL: var_funnnel_v32i8:
557 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
558 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
559 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
560 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
561 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
563 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
568 ; Uniform Variable Shifts
; Rotate-left of <4 x i64> by a uniform amount. %splat broadcasts element 0 of
; %amt to every lane (shufflevector with a zeroinitializer mask), and the
; result is fshl with %x supplied for both data operands.
; Per the checks below, the AVX-512 configurations broadcast the amount with
; vpbroadcastq and select vprolvq, XOP splats the amount and selects vprotq on
; each 128-bit half, and plain AVX / AVX2 use vpsllq / vpsrlq with the count
; taken from an xmm register, combined with an or.
571 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
572 ; AVX1-LABEL: splatvar_funnnel_v4i64:
574 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [63,63]
575 ; AVX1-NEXT: # xmm2 = mem[0,0]
576 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
577 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
578 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
579 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
580 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
581 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
582 ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
583 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
584 ; AVX1-NEXT: vpsrlq %xmm1, %xmm4, %xmm2
585 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
586 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
587 ; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
590 ; AVX2-LABEL: splatvar_funnnel_v4i64:
592 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
593 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
594 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3
595 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
596 ; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
597 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
598 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
599 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
602 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
604 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
605 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
606 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
607 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
610 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
612 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
613 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
614 ; AVX512VL-NEXT: retq
616 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
618 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
619 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
620 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
621 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
622 ; AVX512BW-NEXT: retq
624 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
625 ; AVX512VLBW: # %bb.0:
626 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
627 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
628 ; AVX512VLBW-NEXT: retq
630 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
631 ; AVX512VBMI2: # %bb.0:
632 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
633 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
634 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
635 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
636 ; AVX512VBMI2-NEXT: retq
638 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
639 ; AVX512VLVBMI2: # %bb.0:
640 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
641 ; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
642 ; AVX512VLVBMI2-NEXT: retq
644 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
646 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
647 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
648 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
649 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
650 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
653 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
655 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
656 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
657 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
658 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
659 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
661 %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
662 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
; Rotate-left of every i32 lane of %x by one variable amount: lane 0 of %amt is
; splatted across the vector (zeroinitializer shuffle mask) and handed to
; llvm.fshl.v8i32 with %x as both funnel inputs, which makes the funnel shift a
; rotate left.
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; Lowering pinned by the checks: vprolvd on the AVX512 configurations, vprotd on
; the XOP configurations, and a vpshufd + vpsllq + vshufps sequence on plain
; AVX1 and AVX2.
666 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
667 ; AVX1-LABEL: splatvar_funnnel_v8i32:
669 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
670 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
671 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
672 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3
673 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3]
674 ; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm4
675 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
676 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
677 ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
678 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
679 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
680 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
681 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
684 ; AVX2-LABEL: splatvar_funnnel_v8i32:
686 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7]
687 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
688 ; AVX2-NEXT: vpsllq %xmm1, %ymm2, %ymm2
689 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
690 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
691 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
694 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
696 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
697 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
698 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
699 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
702 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
704 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
705 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
706 ; AVX512VL-NEXT: retq
708 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
710 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
711 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
712 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
713 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
714 ; AVX512BW-NEXT: retq
716 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
717 ; AVX512VLBW: # %bb.0:
718 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
719 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
720 ; AVX512VLBW-NEXT: retq
722 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
723 ; AVX512VBMI2: # %bb.0:
724 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
725 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
726 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
727 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
728 ; AVX512VBMI2-NEXT: retq
730 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
731 ; AVX512VLVBMI2: # %bb.0:
732 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
733 ; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
734 ; AVX512VLVBMI2-NEXT: retq
736 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
738 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
739 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
740 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
741 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
742 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
745 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
747 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
748 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
749 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
750 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
751 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
753 %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
754 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
; Rotate-left of every i16 lane of %x by one variable amount: lane 0 of %amt is
; splatted (zeroinitializer shuffle mask) and passed to llvm.fshl.v16i16 with %x
; as both funnel inputs.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; Lowering pinned by the checks:
;  - AVX1, AVX2 and the AVX512 F/VL/BW/VLBW configurations build the rotate from
;    shifts: the amount is masked with 15 for vpsllw, the inverted amount is
;    masked with 15 (vpandn) and applied after a fixed vpsrlw $1, and the two
;    halves are combined with vpor.
;  - the VBMI2 configurations use vpbroadcastw + vpshldvw.
;  - the XOP configurations use vprotw on each 128-bit half.
758 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
759 ; AVX1-LABEL: splatvar_funnnel_v16i16:
761 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
762 ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
763 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
764 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
765 ; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm5
766 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
767 ; AVX1-NEXT: vpsllw %xmm1, %xmm4, %xmm2
768 ; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
769 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4
770 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
771 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
772 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
773 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
776 ; AVX2-LABEL: splatvar_funnnel_v16i16:
778 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
779 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3
780 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4
781 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
782 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
783 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
784 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
787 ; AVX512F-LABEL: splatvar_funnnel_v16i16:
789 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
790 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
791 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
792 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
793 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
794 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
795 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
798 ; AVX512VL-LABEL: splatvar_funnnel_v16i16:
800 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
801 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
802 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
803 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
804 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
805 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
806 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
807 ; AVX512VL-NEXT: retq
809 ; AVX512BW-LABEL: splatvar_funnnel_v16i16:
811 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
812 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
813 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4
814 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
815 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
816 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
817 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
818 ; AVX512BW-NEXT: retq
820 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
821 ; AVX512VLBW: # %bb.0:
822 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
823 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
824 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4
825 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
826 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
827 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
828 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
829 ; AVX512VLBW-NEXT: retq
831 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
832 ; AVX512VBMI2: # %bb.0:
833 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
834 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
835 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
836 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
837 ; AVX512VBMI2-NEXT: retq
839 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
840 ; AVX512VLVBMI2: # %bb.0:
841 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
842 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
843 ; AVX512VLVBMI2-NEXT: retq
845 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
847 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
848 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
849 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
850 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
851 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
852 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
855 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
857 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
858 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
859 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
860 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
861 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
863 %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
864 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
; Rotate-left of every i8 lane of %x by one variable amount: lane 0 of %amt is
; splatted (zeroinitializer shuffle mask) and passed to llvm.fshl.v32i8 with %x
; as both funnel inputs.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; Lowering pinned by the checks:
;  - every non-XOP configuration widens to words: vpunpckhbw/vpunpcklbw
;    duplicate each byte into a 16-bit lane, vpsllw shifts by the masked
;    amount, vpsrlw $8 brings the high byte of each word down, and vpackuswb
;    narrows back to bytes.
;  - the XOP configurations splat the amount and use vprotb on each 128-bit half.
868 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
869 ; AVX1-LABEL: splatvar_funnnel_v32i8:
871 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
872 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
873 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
874 ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
875 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
876 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
877 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
878 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
879 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
880 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
881 ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
882 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
883 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
884 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
885 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
886 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
887 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
890 ; AVX2-LABEL: splatvar_funnnel_v32i8:
892 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
893 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
894 ; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
895 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
896 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
897 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
898 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
899 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
902 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
904 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
905 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
906 ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
907 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
908 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
909 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
910 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
911 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
914 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
916 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
917 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
918 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
919 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
920 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
921 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
922 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
923 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
924 ; AVX512VL-NEXT: retq
926 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
928 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
929 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
930 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm2, %ymm2
931 ; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
932 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
933 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
934 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
935 ; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
936 ; AVX512BW-NEXT: retq
938 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
939 ; AVX512VLBW: # %bb.0:
940 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
941 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
942 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm2, %ymm2
943 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm2, %ymm2
944 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
945 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
946 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
947 ; AVX512VLBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
948 ; AVX512VLBW-NEXT: retq
950 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
951 ; AVX512VBMI2: # %bb.0:
952 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
953 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
954 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
955 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2
956 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
957 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
958 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
959 ; AVX512VBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
960 ; AVX512VBMI2-NEXT: retq
962 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
963 ; AVX512VLVBMI2: # %bb.0:
964 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
965 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
966 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
967 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2
968 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
969 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
970 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
971 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
972 ; AVX512VLVBMI2-NEXT: retq
974 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
976 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
977 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
978 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
979 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
980 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
981 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
984 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
986 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
987 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
988 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
989 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
990 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
992 %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
993 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
; Rotate-left of each i64 lane of %x by a different compile-time constant:
; llvm.fshl.v4i64 is called with %x as both funnel inputs and the per-lane
; amounts <4, 14, 50, 60>.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; Lowering pinned by the checks:
;  - AVX1 uses immediate vpsrlq/vpsllq pairs (4/60 and 14/50) merged per lane
;    with vpblendw, then ORs the two halves together with vorps.
;  - AVX2 uses vpsrlvq + vpsllvq with constant-pool operands and vpor.
;  - the AVX512 configurations use vprolvq (from the constant pool with VL,
;    otherwise widened to zmm with the amounts materialised in a register).
;  - the XOP configurations use vprotq on each 128-bit half.
1001 define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
1002 ; AVX1-LABEL: constant_funnnel_v4i64:
1004 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1005 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm2
1006 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm3
1007 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1008 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
1009 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm4
1010 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1011 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1012 ; AVX1-NEXT: vpsllq $60, %xmm1, %xmm3
1013 ; AVX1-NEXT: vpsllq $50, %xmm1, %xmm1
1014 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1015 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
1016 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
1017 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
1018 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1019 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
1022 ; AVX2-LABEL: constant_funnnel_v4i64:
1024 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1025 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1026 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1029 ; AVX512F-LABEL: constant_funnnel_v4i64:
1031 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1032 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1033 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1034 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1035 ; AVX512F-NEXT: retq
1037 ; AVX512VL-LABEL: constant_funnnel_v4i64:
1038 ; AVX512VL: # %bb.0:
1039 ; AVX512VL-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1040 ; AVX512VL-NEXT: retq
1042 ; AVX512BW-LABEL: constant_funnnel_v4i64:
1043 ; AVX512BW: # %bb.0:
1044 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1045 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1046 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1047 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1048 ; AVX512BW-NEXT: retq
1050 ; AVX512VLBW-LABEL: constant_funnnel_v4i64:
1051 ; AVX512VLBW: # %bb.0:
1052 ; AVX512VLBW-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1053 ; AVX512VLBW-NEXT: retq
1055 ; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
1056 ; AVX512VBMI2: # %bb.0:
1057 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1058 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1059 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1060 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1061 ; AVX512VBMI2-NEXT: retq
1063 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
1064 ; AVX512VLVBMI2: # %bb.0:
1065 ; AVX512VLVBMI2-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1066 ; AVX512VLVBMI2-NEXT: retq
1068 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
1070 ; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1071 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1072 ; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1073 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1074 ; XOPAVX1-NEXT: retq
1076 ; XOPAVX2-LABEL: constant_funnnel_v4i64:
1078 ; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1079 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1080 ; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1081 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1082 ; XOPAVX2-NEXT: retq
1083 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
; Rotate-left of each i32 lane of %x by a different compile-time constant:
; llvm.fshl.v8i32 is called with %x as both funnel inputs and the per-lane
; amounts <4, 5, 6, 7, 8, 9, 10, 11>.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; Lowering pinned by the checks:
;  - AVX1 multiplies by constant-pool values with vpmuludq and recombines the
;    low/high product halves with vpshufd + vpblendw + vpor.
;  - AVX2 uses vpsrlvd + vpsllvd with constant-pool operands and vpor.
;  - the AVX512 configurations use vprolvd (from the constant pool with VL,
;    otherwise widened to zmm with the amounts materialised in a register).
;  - the XOP configurations use vprotd on each 128-bit half.
1087 define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
1088 ; AVX1-LABEL: constant_funnnel_v8i32:
1090 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1091 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1092 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1093 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1094 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
1095 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
1096 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1097 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1098 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1099 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1100 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1101 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1102 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1103 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
1104 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1105 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1106 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1107 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1110 ; AVX2-LABEL: constant_funnnel_v8i32:
1112 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1113 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1114 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1117 ; AVX512F-LABEL: constant_funnnel_v8i32:
1119 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1120 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1121 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1122 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1123 ; AVX512F-NEXT: retq
1125 ; AVX512VL-LABEL: constant_funnnel_v8i32:
1126 ; AVX512VL: # %bb.0:
1127 ; AVX512VL-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1128 ; AVX512VL-NEXT: retq
1130 ; AVX512BW-LABEL: constant_funnnel_v8i32:
1131 ; AVX512BW: # %bb.0:
1132 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1133 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1134 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1135 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1136 ; AVX512BW-NEXT: retq
1138 ; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1139 ; AVX512VLBW: # %bb.0:
1140 ; AVX512VLBW-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1141 ; AVX512VLBW-NEXT: retq
1143 ; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
1144 ; AVX512VBMI2: # %bb.0:
1145 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1146 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1147 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1148 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1149 ; AVX512VBMI2-NEXT: retq
1151 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
1152 ; AVX512VLVBMI2: # %bb.0:
1153 ; AVX512VLVBMI2-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1154 ; AVX512VLVBMI2-NEXT: retq
1156 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
1158 ; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1159 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1160 ; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1161 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1162 ; XOPAVX1-NEXT: retq
1164 ; XOPAVX2-LABEL: constant_funnnel_v8i32:
1166 ; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1167 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1168 ; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1169 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1170 ; XOPAVX2-NEXT: retq
1171 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
; Rotate-left of each i16 lane of %x by a different compile-time constant:
; llvm.fshl.v16i16 is called with %x as both funnel inputs and the per-lane
; amounts 0 through 15.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; Lowering pinned by the checks:
;  - AVX1, AVX2, AVX512F and AVX512VL multiply by the powers of two 1..32768:
;    vpmullw gives the left-shifted bits, vpmulhuw the bits that wrapped
;    around, and vpor merges them.
;  - AVX512BW and AVX512VLBW use vpsrlvw + vpsllvw + vpor.
;  - the VBMI2 configurations use a single vpshldvw.
;  - the XOP configurations use vprotw on each 128-bit half.
1175 define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
1176 ; AVX1-LABEL: constant_funnnel_v16i16:
1178 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1179 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1180 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1181 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1182 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1183 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1184 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1185 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1186 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1187 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1190 ; AVX2-LABEL: constant_funnnel_v16i16:
1192 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1193 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1194 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1195 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1198 ; AVX512F-LABEL: constant_funnnel_v16i16:
1200 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1201 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1202 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1203 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1204 ; AVX512F-NEXT: retq
1206 ; AVX512VL-LABEL: constant_funnnel_v16i16:
1207 ; AVX512VL: # %bb.0:
1208 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1209 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1210 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1211 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1212 ; AVX512VL-NEXT: retq
1214 ; AVX512BW-LABEL: constant_funnnel_v16i16:
1215 ; AVX512BW: # %bb.0:
1216 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1217 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1218 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
1219 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1220 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1221 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1222 ; AVX512BW-NEXT: retq
1224 ; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1225 ; AVX512VLBW: # %bb.0:
1226 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1227 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1228 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1229 ; AVX512VLBW-NEXT: retq
1231 ; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
1232 ; AVX512VBMI2: # %bb.0:
1233 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1234 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1235 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
1236 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1237 ; AVX512VBMI2-NEXT: retq
1239 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
1240 ; AVX512VLVBMI2: # %bb.0:
1241 ; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1242 ; AVX512VLVBMI2-NEXT: retq
1244 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
1246 ; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1247 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1248 ; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1249 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1250 ; XOPAVX1-NEXT: retq
1252 ; XOPAVX2-LABEL: constant_funnnel_v16i16:
1254 ; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1255 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1256 ; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1257 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1258 ; XOPAVX2-NEXT: retq
1259 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
; Rotate-left of each i8 lane of %x by a different compile-time constant:
; llvm.fshl.v32i8 is called with %x as both funnel inputs and, in each 16-byte
; half, the amounts 0,1,...,7,8,7,...,1.
; The amount 8 equals the element width; funnel-shift amounts are taken modulo
; the element width, and the XOP constant below accordingly shows 0 in that lane.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; Lowering pinned by the checks:
;  - AVX1, AVX2, AVX512F and AVX512VL widen bytes to words with
;    vpunpckhbw/vpunpcklbw, multiply by powers of two with vpmullw, take the
;    high byte with vpsrlw $8 and narrow with vpackuswb.
;  - the BW and VBMI2 configurations do the same but shift with vpsllvw
;    instead of multiplying.
;  - the XOP configurations use vprotb on each 128-bit half.
1263 define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
1264 ; AVX1-LABEL: constant_funnnel_v32i8:
1266 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1267 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1268 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2]
1269 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1270 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1271 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1272 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1273 ; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
1274 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1275 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1276 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1277 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1278 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1279 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1280 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
1281 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1282 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1283 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1286 ; AVX2-LABEL: constant_funnnel_v32i8:
1288 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1289 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1290 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
1291 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1292 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1293 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1294 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1297 ; AVX512F-LABEL: constant_funnnel_v32i8:
1299 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1300 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1301 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
1302 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1303 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1304 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1305 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1306 ; AVX512F-NEXT: retq
1308 ; AVX512VL-LABEL: constant_funnnel_v32i8:
1309 ; AVX512VL: # %bb.0:
1310 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1311 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1312 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
1313 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1314 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1315 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1316 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1317 ; AVX512VL-NEXT: retq
1319 ; AVX512BW-LABEL: constant_funnnel_v32i8:
1320 ; AVX512BW: # %bb.0:
1321 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
1322 ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
1323 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1324 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
1325 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
1326 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
1327 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
1328 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1329 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1330 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
1331 ; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1332 ; AVX512BW-NEXT: retq
1334 ; AVX512VLBW-LABEL: constant_funnnel_v32i8:
1335 ; AVX512VLBW: # %bb.0:
1336 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1337 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1338 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1
1339 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1340 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1341 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
1342 ; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1343 ; AVX512VLBW-NEXT: retq
1345 ; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
1346 ; AVX512VBMI2: # %bb.0:
1347 ; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
1348 ; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1]
1349 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1350 ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
1351 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
1352 ; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
1353 ; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1]
1354 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1355 ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1356 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
1357 ; AVX512VBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1358 ; AVX512VBMI2-NEXT: retq
1360 ; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
1361 ; AVX512VLVBMI2: # %bb.0:
1362 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1363 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1364 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
1365 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1366 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1367 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
1368 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1369 ; AVX512VLVBMI2-NEXT: retq
1371 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
1373 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1374 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1375 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
1376 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
1377 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1378 ; XOPAVX1-NEXT: retq
1380 ; XOPAVX2-LABEL: constant_funnnel_v32i8:
1382 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1383 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1384 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
1385 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
1386 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1387 ; XOPAVX2-NEXT: retq
1388 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1393 ; Uniform Constant Shifts
; Test of llvm.fshl.v4i64 with %x passed as both data operands (i.e. a rotate
; left of each i64 lane) and a constant shift amount of 14 in every lane.
; The per-prefix assertions that follow are autogenerated (see the NOTE on the
; first line of this file) - regenerate them instead of editing them by hand.
1396 define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
1397 ; AVX1-LABEL: splatconstant_funnnel_v4i64:
1399 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1
1400 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1401 ; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm3
1402 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1403 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0
1404 ; AVX1-NEXT: vpsllq $14, %xmm2, %xmm2
1405 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1406 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1409 ; AVX2-LABEL: splatconstant_funnnel_v4i64:
1411 ; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm1
1412 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0
1413 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1416 ; AVX512F-LABEL: splatconstant_funnnel_v4i64:
1418 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1419 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
1420 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1421 ; AVX512F-NEXT: retq
1423 ; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
1424 ; AVX512VL: # %bb.0:
1425 ; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
1426 ; AVX512VL-NEXT: retq
1428 ; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
1429 ; AVX512BW: # %bb.0:
1430 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1431 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
1432 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1433 ; AVX512BW-NEXT: retq
1435 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
1436 ; AVX512VLBW: # %bb.0:
1437 ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
1438 ; AVX512VLBW-NEXT: retq
1440 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
1441 ; AVX512VBMI2: # %bb.0:
1442 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1443 ; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
1444 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1445 ; AVX512VBMI2-NEXT: retq
1447 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
1448 ; AVX512VLVBMI2: # %bb.0:
1449 ; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0
1450 ; AVX512VLVBMI2-NEXT: retq
1452 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
1454 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
1455 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1456 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
1457 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1458 ; XOPAVX1-NEXT: retq
1460 ; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
1462 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
1463 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1464 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
1465 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1466 ; XOPAVX2-NEXT: retq
1467 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
; Test of llvm.fshl.v8i32 with %x passed as both data operands (i.e. a rotate
; left of each i32 lane) and a constant shift amount of 4 in every lane.
; The per-prefix assertions that follow are autogenerated (see the NOTE on the
; first line of this file) - regenerate them instead of editing them by hand.
1471 define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
1472 ; AVX1-LABEL: splatconstant_funnnel_v8i32:
1474 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1475 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1476 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1477 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1478 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1479 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1480 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1481 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1484 ; AVX2-LABEL: splatconstant_funnnel_v8i32:
1486 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1487 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1488 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1491 ; AVX512F-LABEL: splatconstant_funnnel_v8i32:
1493 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1494 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1495 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1496 ; AVX512F-NEXT: retq
1498 ; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
1499 ; AVX512VL: # %bb.0:
1500 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1501 ; AVX512VL-NEXT: retq
1503 ; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
1504 ; AVX512BW: # %bb.0:
1505 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1506 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1507 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1508 ; AVX512BW-NEXT: retq
1510 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
1511 ; AVX512VLBW: # %bb.0:
1512 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1513 ; AVX512VLBW-NEXT: retq
1515 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
1516 ; AVX512VBMI2: # %bb.0:
1517 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1518 ; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
1519 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1520 ; AVX512VBMI2-NEXT: retq
1522 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
1523 ; AVX512VLVBMI2: # %bb.0:
1524 ; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
1525 ; AVX512VLVBMI2-NEXT: retq
1527 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
1529 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1530 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1531 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1532 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1533 ; XOPAVX1-NEXT: retq
1535 ; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
1537 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1538 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1539 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1540 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1541 ; XOPAVX2-NEXT: retq
1542 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
; Test of llvm.fshl.v16i16 with %x passed as both data operands (i.e. a rotate
; left of each i16 lane) and a constant shift amount of 7 in every lane.
; The per-prefix assertions that follow are autogenerated (see the NOTE on the
; first line of this file) - regenerate them instead of editing them by hand.
1546 define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
1547 ; AVX1-LABEL: splatconstant_funnnel_v16i16:
1549 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1550 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
1551 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
1552 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1553 ; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
1554 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
1555 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1556 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1559 ; AVX2-LABEL: splatconstant_funnnel_v16i16:
1561 ; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
1562 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
1563 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1566 ; AVX512F-LABEL: splatconstant_funnnel_v16i16:
1568 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
1569 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
1570 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1571 ; AVX512F-NEXT: retq
1573 ; AVX512VL-LABEL: splatconstant_funnnel_v16i16:
1574 ; AVX512VL: # %bb.0:
1575 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
1576 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
1577 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1578 ; AVX512VL-NEXT: retq
1580 ; AVX512BW-LABEL: splatconstant_funnnel_v16i16:
1581 ; AVX512BW: # %bb.0:
1582 ; AVX512BW-NEXT: vpsrlw $9, %ymm0, %ymm1
1583 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
1584 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1585 ; AVX512BW-NEXT: retq
1587 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16:
1588 ; AVX512VLBW: # %bb.0:
1589 ; AVX512VLBW-NEXT: vpsrlw $9, %ymm0, %ymm1
1590 ; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
1591 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1592 ; AVX512VLBW-NEXT: retq
1594 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16:
1595 ; AVX512VBMI2: # %bb.0:
1596 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1597 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
1598 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1599 ; AVX512VBMI2-NEXT: retq
1601 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
1602 ; AVX512VLVBMI2: # %bb.0:
1603 ; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm0, %ymm0, %ymm0
1604 ; AVX512VLVBMI2-NEXT: retq
1606 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
1608 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
1609 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1610 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
1611 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1612 ; XOPAVX1-NEXT: retq
1614 ; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
1616 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
1617 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1618 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
1619 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1620 ; XOPAVX2-NEXT: retq
1621 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1625 define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
1626 ; AVX1-LABEL: splatconstant_funnnel_v32i8:
1628 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1629 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1630 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1631 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1632 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1633 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1634 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1635 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1636 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1637 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1638 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1639 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1640 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1643 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
1645 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1646 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1647 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1648 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1649 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1652 ; AVX512F-LABEL: splatconstant_funnnel_v32i8:
1654 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1655 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1656 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
1657 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1658 ; AVX512F-NEXT: retq
1660 ; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
1661 ; AVX512VL: # %bb.0:
1662 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1663 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
1664 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
1665 ; AVX512VL-NEXT: retq
1667 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
1668 ; AVX512BW: # %bb.0:
1669 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
1670 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
1671 ; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
1672 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1673 ; AVX512BW-NEXT: retq
1675 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
1676 ; AVX512VLBW: # %bb.0:
1677 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1678 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1679 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
1680 ; AVX512VLBW-NEXT: retq
1682 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
1683 ; AVX512VBMI2: # %bb.0:
1684 ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
1685 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
1686 ; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0
1687 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1688 ; AVX512VBMI2-NEXT: retq
1690 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
1691 ; AVX512VLVBMI2: # %bb.0:
1692 ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
1693 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
1694 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
1695 ; AVX512VLVBMI2-NEXT: retq
1697 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
1699 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1700 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1701 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1702 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1703 ; XOPAVX1-NEXT: retq
1705 ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
1707 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1708 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1709 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1710 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1711 ; XOPAVX2-NEXT: retq
1712 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)