1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512VLBW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefix=AVX512VBMI2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefix=AVX512VLVBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; Funnel-shift-left intrinsic declarations, one per 256-bit integer vector
; type (<4 x i64>, <8 x i32>, <16 x i16>, <32 x i8>) called by the tests below.
13 declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
14 declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
15 declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
16 declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; Variable (per-element) amount, <4 x i64>: llvm.fshl is called with %x as both
; data operands, which per the LangRef is a rotate-left of each element of %x by
; the matching element of %amt. The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them with the script rather than editing
; them by hand.
22 define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
23 ; AVX1-LABEL: var_funnnel_v4i64:
25 ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [63,63,63,63]
26 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm3
27 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
28 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
29 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6
30 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
31 ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
32 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
33 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm6
34 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
35 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
36 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7]
37 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
38 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
39 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
40 ; AVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4
41 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
42 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm7
43 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
44 ; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm4
45 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7]
46 ; AVX1-NEXT: vpsubq %xmm1, %xmm6, %xmm1
47 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
48 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
49 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
50 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
51 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
52 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
53 ; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
56 ; AVX2-LABEL: var_funnnel_v4i64:
58 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
59 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
60 ; AVX2-NEXT: vpsllvq %ymm3, %ymm0, %ymm3
61 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
62 ; AVX2-NEXT: vpsubq %ymm1, %ymm4, %ymm1
63 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
64 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
65 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
68 ; AVX512F-LABEL: var_funnnel_v4i64:
70 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
71 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
72 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
73 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
76 ; AVX512VL-LABEL: var_funnnel_v4i64:
78 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
81 ; AVX512BW-LABEL: var_funnnel_v4i64:
83 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
84 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
85 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
86 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
89 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
90 ; AVX512VLBW: # %bb.0:
91 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
92 ; AVX512VLBW-NEXT: retq
94 ; AVX512VBMI2-LABEL: var_funnnel_v4i64:
95 ; AVX512VBMI2: # %bb.0:
96 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
97 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
98 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
99 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
100 ; AVX512VBMI2-NEXT: retq
102 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
103 ; AVX512VLVBMI2: # %bb.0:
104 ; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
105 ; AVX512VLVBMI2-NEXT: retq
107 ; XOPAVX1-LABEL: var_funnnel_v4i64:
109 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
110 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
111 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
112 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
113 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
116 ; XOPAVX2-LABEL: var_funnnel_v4i64:
118 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
119 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
120 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
121 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
122 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; %x is passed as both data operands; only the amount %amt is independent.
124 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
; Variable (per-element) amount, <8 x i32>: llvm.fshl is called with %x as both
; data operands, which per the LangRef is a rotate-left of each element of %x by
; the matching element of %amt. The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them with the script rather than editing
; them by hand.
128 define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
129 ; AVX1-LABEL: var_funnnel_v8i32:
131 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
132 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
133 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
134 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
135 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
136 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
137 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
138 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
139 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
140 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
141 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
142 ; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
143 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
144 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
145 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
146 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
147 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
148 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
149 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
150 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
151 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
152 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
153 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
154 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
155 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
156 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
157 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
158 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
159 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
160 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
161 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
164 ; AVX2-LABEL: var_funnnel_v8i32:
166 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
167 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
168 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
169 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
170 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
171 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
172 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
175 ; AVX512F-LABEL: var_funnnel_v8i32:
177 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
178 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
179 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
180 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
183 ; AVX512VL-LABEL: var_funnnel_v8i32:
185 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
186 ; AVX512VL-NEXT: retq
188 ; AVX512BW-LABEL: var_funnnel_v8i32:
190 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
191 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
192 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
193 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
194 ; AVX512BW-NEXT: retq
196 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
197 ; AVX512VLBW: # %bb.0:
198 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
199 ; AVX512VLBW-NEXT: retq
201 ; AVX512VBMI2-LABEL: var_funnnel_v8i32:
202 ; AVX512VBMI2: # %bb.0:
203 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
204 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
205 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
206 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
207 ; AVX512VBMI2-NEXT: retq
209 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
210 ; AVX512VLVBMI2: # %bb.0:
211 ; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
212 ; AVX512VLVBMI2-NEXT: retq
214 ; XOPAVX1-LABEL: var_funnnel_v8i32:
216 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
217 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
218 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
219 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
220 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
223 ; XOPAVX2-LABEL: var_funnnel_v8i32:
225 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
226 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
227 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
228 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
229 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; %x is passed as both data operands; only the amount %amt is independent.
231 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
; Variable (per-element) amount, <16 x i16>: llvm.fshl is called with %x as both
; data operands, which per the LangRef is a rotate-left of each element of %x by
; the matching element of %amt. The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them with the script rather than editing
; them by hand.
235 define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
236 ; AVX1-LABEL: var_funnnel_v16i16:
238 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
239 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
240 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
241 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
242 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
243 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
244 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
245 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
246 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
247 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
248 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
249 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
250 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
251 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
252 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
253 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
254 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
255 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
256 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
257 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
258 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
259 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
260 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
261 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
262 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
263 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
264 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
265 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
266 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
267 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
268 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
271 ; AVX2-LABEL: var_funnnel_v16i16:
273 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
274 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
275 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
276 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
277 ; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
278 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
279 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
280 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
281 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
282 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
283 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
286 ; AVX512F-LABEL: var_funnnel_v16i16:
288 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
289 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
290 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
291 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
292 ; AVX512F-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
293 ; AVX512F-NEXT: vpsrld $16, %ymm3, %ymm3
294 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
295 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
296 ; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
297 ; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
298 ; AVX512F-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
301 ; AVX512VL-LABEL: var_funnnel_v16i16:
303 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
304 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
305 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
306 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
307 ; AVX512VL-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
308 ; AVX512VL-NEXT: vpsrld $16, %ymm3, %ymm3
309 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
310 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
311 ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
312 ; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
313 ; AVX512VL-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
314 ; AVX512VL-NEXT: retq
316 ; AVX512BW-LABEL: var_funnnel_v16i16:
318 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
319 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
320 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
321 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
322 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
323 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
324 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
325 ; AVX512BW-NEXT: retq
327 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
328 ; AVX512VLBW: # %bb.0:
329 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
330 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
331 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
332 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
333 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
334 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
335 ; AVX512VLBW-NEXT: retq
337 ; AVX512VBMI2-LABEL: var_funnnel_v16i16:
338 ; AVX512VBMI2: # %bb.0:
339 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
340 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
341 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
342 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
343 ; AVX512VBMI2-NEXT: retq
345 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
346 ; AVX512VLVBMI2: # %bb.0:
347 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
348 ; AVX512VLVBMI2-NEXT: retq
350 ; XOPAVX1-LABEL: var_funnnel_v16i16:
352 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
353 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
354 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
355 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
356 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
359 ; XOPAVX2-LABEL: var_funnnel_v16i16:
361 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
362 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
363 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
364 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
365 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; %x is passed as both data operands; only the amount %amt is independent.
367 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
; Variable (per-element) amount, <32 x i8>: llvm.fshl is called with %x as both
; data operands, which per the LangRef is a rotate-left of each element of %x by
; the matching element of %amt. The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them with the script rather than editing
; them by hand.
371 define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
372 ; AVX1-LABEL: var_funnnel_v32i8:
374 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
375 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
376 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
377 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
378 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
379 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
380 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
381 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
382 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
383 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
384 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
385 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
386 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
387 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
388 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
389 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
390 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
391 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
392 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
393 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
394 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
395 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm8
396 ; AVX1-NEXT: vpor %xmm3, %xmm8, %xmm3
397 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
398 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
399 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
400 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
401 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
402 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
403 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
404 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
405 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
406 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
407 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
408 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
409 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
410 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
411 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
412 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
413 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
414 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
415 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
416 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
417 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
418 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
419 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
422 ; AVX2-LABEL: var_funnnel_v32i8:
424 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
425 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
426 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
427 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
428 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
429 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
430 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
431 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
432 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
433 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
434 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
435 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
436 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
437 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
438 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm2
439 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
440 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3
441 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
442 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
443 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
446 ; AVX512F-LABEL: var_funnnel_v32i8:
448 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
449 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
450 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
451 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
452 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
453 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
454 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
455 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3
456 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
457 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
458 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
459 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
460 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
461 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
462 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
463 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
466 ; AVX512VL-LABEL: var_funnnel_v32i8:
468 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
469 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
470 ; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
471 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
472 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
473 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
474 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
475 ; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
476 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
477 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
478 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
479 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
480 ; AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3
481 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
482 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
483 ; AVX512VL-NEXT: retq
485 ; AVX512BW-LABEL: var_funnnel_v32i8:
487 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
488 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
489 ; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
490 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
491 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm2, %zmm2
492 ; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
493 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
494 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
495 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
496 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
497 ; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
498 ; AVX512BW-NEXT: retq
500 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
501 ; AVX512VLBW: # %bb.0:
502 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
503 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
504 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
505 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
506 ; AVX512VLBW-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
507 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm3, %ymm3
508 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
509 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
510 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
511 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
512 ; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
513 ; AVX512VLBW-NEXT: retq
515 ; AVX512VBMI2-LABEL: var_funnnel_v32i8:
516 ; AVX512VBMI2: # %bb.0:
517 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
518 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
519 ; AVX512VBMI2-NEXT: vpxor %xmm3, %xmm3, %xmm3
520 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
521 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm2, %zmm2
522 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2
523 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
524 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
525 ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
526 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
527 ; AVX512VBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
528 ; AVX512VBMI2-NEXT: retq
530 ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
531 ; AVX512VLVBMI2: # %bb.0:
532 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
533 ; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
534 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
535 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
536 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
537 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm3, %ymm3
538 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
539 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
540 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
541 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
542 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
543 ; AVX512VLVBMI2-NEXT: retq
545 ; XOPAVX1-LABEL: var_funnnel_v32i8:
547 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
548 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
549 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
550 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
551 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
554 ; XOPAVX2-LABEL: var_funnnel_v32i8:
556 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
557 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
558 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
559 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
560 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; %x is passed as both data operands; only the amount %amt is independent.
562 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
567 ; Uniform Variable Shifts
; Uniform (splatted) amount, <4 x i64>: element 0 of %amt is broadcast to every
; lane and used as the amount for llvm.fshl with %x as both data operands, which
; per the LangRef is a rotate-left of each element of %x by that one amount.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script rather than editing them by hand.
570 define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
571 ; AVX1-LABEL: splatvar_funnnel_v4i64:
573 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63]
574 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
575 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
576 ; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
577 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
578 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
579 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
580 ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
581 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
582 ; AVX1-NEXT: vpsrlq %xmm1, %xmm4, %xmm2
583 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
584 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
585 ; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
588 ; AVX2-LABEL: splatvar_funnnel_v4i64:
590 ; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [63,63]
591 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
592 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3
593 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
594 ; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
595 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
596 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
597 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
600 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
602 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
603 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
604 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
605 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
608 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
610 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
611 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
612 ; AVX512VL-NEXT: retq
614 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
616 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
617 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
618 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
619 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
620 ; AVX512BW-NEXT: retq
622 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
623 ; AVX512VLBW: # %bb.0:
624 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
625 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
626 ; AVX512VLBW-NEXT: retq
628 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
629 ; AVX512VBMI2: # %bb.0:
630 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
631 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
632 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
633 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
634 ; AVX512VBMI2-NEXT: retq
636 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
637 ; AVX512VLVBMI2: # %bb.0:
638 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
639 ; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
640 ; AVX512VLVBMI2-NEXT: retq
642 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
644 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
645 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
646 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
647 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
648 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
651 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
653 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
654 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
655 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm2, %xmm2
656 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
657 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; Broadcast element 0 of %amt to all lanes. The all-zero mask never selects from
; the second operand, so it is only a placeholder; poison is used for it instead
; of the deprecated undef.
659 %splat = shufflevector <4 x i64> %amt, <4 x i64> poison, <4 x i32> zeroinitializer
; %x is passed as both data operands; the amount is the splat built above.
660 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
; Rotate-left test: every i32 lane of %x is rotated by the same variable amount.
; The amount is lane 0 of %amt broadcast to all eight lanes, and the rotate is
; expressed as llvm.fshl with %x supplied as both data operands.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
664 define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
665 ; AVX1-LABEL: splatvar_funnnel_v8i32:
667 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
668 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
669 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
670 ; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm3
671 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3]
672 ; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm4
673 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
674 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
675 ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
676 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
677 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
678 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
679 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
682 ; AVX2-LABEL: splatvar_funnnel_v8i32:
684 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7]
685 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
686 ; AVX2-NEXT: vpsllq %xmm1, %ymm2, %ymm2
687 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
688 ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
689 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
692 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
694 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
695 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
696 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
697 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
700 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
702 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
703 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
704 ; AVX512VL-NEXT: retq
706 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
708 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
709 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
710 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
711 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
712 ; AVX512BW-NEXT: retq
714 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
715 ; AVX512VLBW: # %bb.0:
716 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
717 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
718 ; AVX512VLBW-NEXT: retq
720 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
721 ; AVX512VBMI2: # %bb.0:
722 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
723 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
724 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
725 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
726 ; AVX512VBMI2-NEXT: retq
728 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
729 ; AVX512VLVBMI2: # %bb.0:
730 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
731 ; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
732 ; AVX512VLVBMI2-NEXT: retq
734 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
736 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
737 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
738 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
739 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
740 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
743 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
745 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
746 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
747 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm2, %xmm2
748 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
749 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; Broadcast element 0 of %amt to all lanes. The all-zero mask never selects the
; second shuffle operand, so it is poison rather than the deprecated undef.
751 %splat = shufflevector <8 x i32> %amt, <8 x i32> poison, <8 x i32> zeroinitializer
; fshl(%x, %x, %splat): both data operands are %x, i.e. a rotate left.
752 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
; Rotate-left test: every i16 lane of %x is rotated by the same variable amount.
; The amount is lane 0 of %amt broadcast to all sixteen lanes, and the rotate is
; expressed as llvm.fshl with %x supplied as both data operands.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
756 define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
757 ; AVX1-LABEL: splatvar_funnnel_v16i16:
759 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [15,0]
760 ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
761 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
762 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
763 ; AVX1-NEXT: vpsrlw %xmm3, %xmm5, %xmm5
764 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
765 ; AVX1-NEXT: vpsllw %xmm1, %xmm4, %xmm2
766 ; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
767 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4
768 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
769 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
770 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
771 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
774 ; AVX2-LABEL: splatvar_funnnel_v16i16:
776 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
777 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm3
778 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm4
779 ; AVX2-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
780 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
781 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
782 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
785 ; AVX512F-LABEL: splatvar_funnnel_v16i16:
787 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
788 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
789 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4
790 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
791 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
792 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
793 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
796 ; AVX512VL-LABEL: splatvar_funnnel_v16i16:
798 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
799 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
800 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
801 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
802 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
803 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
804 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
805 ; AVX512VL-NEXT: retq
807 ; AVX512BW-LABEL: splatvar_funnnel_v16i16:
809 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
810 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
811 ; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm4
812 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
813 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
814 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
815 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
816 ; AVX512BW-NEXT: retq
818 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
819 ; AVX512VLBW: # %bb.0:
820 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
821 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
822 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm0, %ymm4
823 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm4, %ymm3
824 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
825 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
826 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
827 ; AVX512VLBW-NEXT: retq
829 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
830 ; AVX512VBMI2: # %bb.0:
831 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
832 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
833 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
834 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
835 ; AVX512VBMI2-NEXT: retq
837 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
838 ; AVX512VLVBMI2: # %bb.0:
839 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %ymm1
840 ; AVX512VLVBMI2-NEXT: vpshldvw %ymm1, %ymm0, %ymm0
841 ; AVX512VLVBMI2-NEXT: retq
843 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
845 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
846 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
847 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
848 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
849 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
850 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
853 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
855 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
856 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
857 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm2, %xmm2
858 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
859 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; Broadcast element 0 of %amt to all lanes. The all-zero mask never selects the
; second shuffle operand, so it is poison rather than the deprecated undef.
861 %splat = shufflevector <16 x i16> %amt, <16 x i16> poison, <16 x i32> zeroinitializer
; fshl(%x, %x, %splat): both data operands are %x, i.e. a rotate left.
862 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
; Rotate-left test: every i8 lane of %x is rotated by the same variable amount.
; The amount is lane 0 of %amt broadcast to all thirty-two lanes, and the rotate
; is expressed as llvm.fshl with %x supplied as both data operands.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
866 define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
867 ; AVX1-LABEL: splatvar_funnnel_v32i8:
869 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
870 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
871 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
872 ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
873 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
874 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
875 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
876 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
877 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
878 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
879 ; AVX1-NEXT: vpsllw %xmm1, %xmm3, %xmm3
880 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
881 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
882 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
883 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
884 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
885 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
888 ; AVX2-LABEL: splatvar_funnnel_v32i8:
890 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
891 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
892 ; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
893 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
894 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
895 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
896 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
897 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
900 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
902 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
903 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
904 ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
905 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
906 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
907 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
908 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
909 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
912 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
914 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
915 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
916 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
917 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
918 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
919 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
920 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
921 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
922 ; AVX512VL-NEXT: retq
924 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
926 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
927 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
928 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm2, %ymm2
929 ; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
930 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
931 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
932 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
933 ; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
934 ; AVX512BW-NEXT: retq
936 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
937 ; AVX512VLBW: # %bb.0:
938 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
939 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
940 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm2, %ymm2
941 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm2, %ymm2
942 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
943 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
944 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
945 ; AVX512VLBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
946 ; AVX512VLBW-NEXT: retq
948 ; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
949 ; AVX512VBMI2: # %bb.0:
950 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
951 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
952 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
953 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2
954 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
955 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
956 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
957 ; AVX512VBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
958 ; AVX512VBMI2-NEXT: retq
960 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
961 ; AVX512VLVBMI2: # %bb.0:
962 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
963 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
964 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm2, %ymm2
965 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm2, %ymm2
966 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
967 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
968 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
969 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
970 ; AVX512VLVBMI2-NEXT: retq
972 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
974 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
975 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
976 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
977 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
978 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
979 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
982 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
984 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
985 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
986 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm2, %xmm2
987 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
988 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; Broadcast element 0 of %amt to all lanes. The all-zero mask never selects the
; second shuffle operand, so it is poison rather than the deprecated undef.
990 %splat = shufflevector <32 x i8> %amt, <32 x i8> poison, <32 x i32> zeroinitializer
; fshl(%x, %x, %splat): both data operands are %x, i.e. a rotate left.
991 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
; Rotate-left test with a distinct compile-time constant amount per i64 lane:
; llvm.fshl is called with %x as both data operands and <4, 14, 50, 60> as the
; shift vector.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
999 define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
1000 ; AVX1-LABEL: constant_funnnel_v4i64:
1002 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1003 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm2
1004 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm3
1005 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1006 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
1007 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm4
1008 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1009 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1010 ; AVX1-NEXT: vpsllq $60, %xmm1, %xmm3
1011 ; AVX1-NEXT: vpsllq $50, %xmm1, %xmm1
1012 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1013 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
1014 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
1015 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
1016 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1017 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
1020 ; AVX2-LABEL: constant_funnnel_v4i64:
1022 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1023 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1024 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1027 ; AVX512F-LABEL: constant_funnnel_v4i64:
1029 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1030 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60]
1031 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1032 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1033 ; AVX512F-NEXT: retq
1035 ; AVX512VL-LABEL: constant_funnnel_v4i64:
1036 ; AVX512VL: # %bb.0:
1037 ; AVX512VL-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1038 ; AVX512VL-NEXT: retq
1040 ; AVX512BW-LABEL: constant_funnnel_v4i64:
1041 ; AVX512BW: # %bb.0:
1042 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1043 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60]
1044 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1045 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1046 ; AVX512BW-NEXT: retq
1048 ; AVX512VLBW-LABEL: constant_funnnel_v4i64:
1049 ; AVX512VLBW: # %bb.0:
1050 ; AVX512VLBW-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1051 ; AVX512VLBW-NEXT: retq
1053 ; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
1054 ; AVX512VBMI2: # %bb.0:
1055 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1056 ; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60]
1057 ; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
1058 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1059 ; AVX512VBMI2-NEXT: retq
1061 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
1062 ; AVX512VLVBMI2: # %bb.0:
1063 ; AVX512VLVBMI2-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1064 ; AVX512VLVBMI2-NEXT: retq
1066 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
1068 ; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1069 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1070 ; XOPAVX1-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1071 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1072 ; XOPAVX1-NEXT: retq
1074 ; XOPAVX2-LABEL: constant_funnnel_v4i64:
1076 ; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1077 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1078 ; XOPAVX2-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1079 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1080 ; XOPAVX2-NEXT: retq
; fshl(%x, %x, C): both data operands are %x, i.e. a rotate left by the
; per-lane constants in C.
1081 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
; Rotate-left test with a distinct compile-time constant amount per i32 lane:
; llvm.fshl is called with %x as both data operands and <4, 5, ..., 11> as the
; shift vector.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1085 define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
1086 ; AVX1-LABEL: constant_funnnel_v8i32:
1088 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1089 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1090 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1091 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1092 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
1093 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
1094 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1095 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1096 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1097 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1098 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1099 ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1100 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1101 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
1102 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1103 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1104 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1105 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1108 ; AVX2-LABEL: constant_funnnel_v8i32:
1110 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1111 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1112 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1115 ; AVX512F-LABEL: constant_funnnel_v8i32:
1117 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1118 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1119 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1120 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1121 ; AVX512F-NEXT: retq
1123 ; AVX512VL-LABEL: constant_funnnel_v8i32:
1124 ; AVX512VL: # %bb.0:
1125 ; AVX512VL-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1126 ; AVX512VL-NEXT: retq
1128 ; AVX512BW-LABEL: constant_funnnel_v8i32:
1129 ; AVX512BW: # %bb.0:
1130 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1131 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1132 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1133 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1134 ; AVX512BW-NEXT: retq
1136 ; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1137 ; AVX512VLBW: # %bb.0:
1138 ; AVX512VLBW-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1139 ; AVX512VLBW-NEXT: retq
1141 ; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
1142 ; AVX512VBMI2: # %bb.0:
1143 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1144 ; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1145 ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1146 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1147 ; AVX512VBMI2-NEXT: retq
1149 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
1150 ; AVX512VLVBMI2: # %bb.0:
1151 ; AVX512VLVBMI2-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1152 ; AVX512VLVBMI2-NEXT: retq
1154 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
1156 ; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1157 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1158 ; XOPAVX1-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1159 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1160 ; XOPAVX1-NEXT: retq
1162 ; XOPAVX2-LABEL: constant_funnnel_v8i32:
1164 ; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1165 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1166 ; XOPAVX2-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1167 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1168 ; XOPAVX2-NEXT: retq
; fshl(%x, %x, C): both data operands are %x, i.e. a rotate left by the
; per-lane constants in C.
1169 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
; Rotate-left test with a distinct compile-time constant amount per i16 lane:
; llvm.fshl is called with %x as both data operands and <0, 1, ..., 15> as the
; shift vector (lane 0 is therefore rotated by zero).
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1173 define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
1174 ; AVX1-LABEL: constant_funnnel_v16i16:
1176 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1177 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1178 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1179 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1180 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1181 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1182 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1183 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1184 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1185 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1188 ; AVX2-LABEL: constant_funnnel_v16i16:
1190 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1191 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1192 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1193 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1196 ; AVX512F-LABEL: constant_funnnel_v16i16:
1198 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1199 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1200 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1201 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1202 ; AVX512F-NEXT: retq
1204 ; AVX512VL-LABEL: constant_funnnel_v16i16:
1205 ; AVX512VL: # %bb.0:
1206 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1207 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1208 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1209 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1210 ; AVX512VL-NEXT: retq
1212 ; AVX512BW-LABEL: constant_funnnel_v16i16:
1213 ; AVX512BW: # %bb.0:
1214 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1215 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1216 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
1217 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1218 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1219 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1220 ; AVX512BW-NEXT: retq
1222 ; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1223 ; AVX512VLBW: # %bb.0:
1224 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1225 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1226 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1227 ; AVX512VLBW-NEXT: retq
1229 ; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
1230 ; AVX512VBMI2: # %bb.0:
1231 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1232 ; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1233 ; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
1234 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1235 ; AVX512VBMI2-NEXT: retq
1237 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
1238 ; AVX512VLVBMI2: # %bb.0:
1239 ; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1240 ; AVX512VLVBMI2-NEXT: retq
1242 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
1244 ; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1245 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1246 ; XOPAVX1-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1247 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1248 ; XOPAVX1-NEXT: retq
1250 ; XOPAVX2-LABEL: constant_funnnel_v16i16:
1252 ; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1253 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1254 ; XOPAVX2-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1255 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1256 ; XOPAVX2-NEXT: retq
; fshl(%x, %x, C): both data operands are %x, i.e. a rotate left by the
; per-lane constants in C.
1257 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
; Rotate-left test with a distinct compile-time constant amount per i8 lane:
; llvm.fshl is called with %x as both data operands and the 16-entry pattern
; <0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1> repeated for both halves of the vector.
; The amount 8 equals the i8 element width; the XOP rotate constants below show
; it as 0 in that position.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1261 define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
1262 ; AVX1-LABEL: constant_funnnel_v32i8:
1264 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1265 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1266 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2]
1267 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1268 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1269 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1270 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1271 ; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
1272 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
1273 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1274 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1275 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
1276 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1277 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1278 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
1279 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1280 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1281 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1284 ; AVX2-LABEL: constant_funnnel_v32i8:
1286 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1287 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1288 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
1289 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1290 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1291 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1292 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1295 ; AVX512F-LABEL: constant_funnnel_v32i8:
1297 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1298 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1299 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
1300 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1301 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1302 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1303 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1304 ; AVX512F-NEXT: retq
1306 ; AVX512VL-LABEL: constant_funnnel_v32i8:
1307 ; AVX512VL: # %bb.0:
1308 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1309 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
1310 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
1311 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1312 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1313 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1314 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1315 ; AVX512VL-NEXT: retq
1317 ; AVX512BW-LABEL: constant_funnnel_v32i8:
1318 ; AVX512BW: # %bb.0:
1319 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
1320 ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
1321 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1322 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
1323 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
1324 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
1325 ; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
1326 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1327 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1328 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
1329 ; AVX512BW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1330 ; AVX512BW-NEXT: retq
1332 ; AVX512VLBW-LABEL: constant_funnnel_v32i8:
1333 ; AVX512VLBW: # %bb.0:
1334 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1335 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1336 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1
1337 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1338 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1339 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
1340 ; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1341 ; AVX512VLBW-NEXT: retq
1343 ; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
1344 ; AVX512VBMI2: # %bb.0:
1345 ; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
1346 ; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1]
1347 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1348 ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1
1349 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
1350 ; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
1351 ; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1]
1352 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1353 ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1354 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
1355 ; AVX512VBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1356 ; AVX512VBMI2-NEXT: retq
1358 ; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
1359 ; AVX512VLVBMI2: # %bb.0:
1360 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1361 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1362 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1
1363 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1364 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1365 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
1366 ; AVX512VLVBMI2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1367 ; AVX512VLVBMI2-NEXT: retq
1369 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
1371 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1372 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1373 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
1374 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
1375 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1376 ; XOPAVX1-NEXT: retq
1378 ; XOPAVX2-LABEL: constant_funnnel_v32i8:
1380 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1381 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1382 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
1383 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
1384 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1385 ; XOPAVX2-NEXT: retq
; fshl(%x, %x, C): both data operands are %x, i.e. a rotate left by the
; per-lane constants in C.
1386 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1391 ; Uniform Constant Shifts
; Rotate-left of every i64 lane by a splat constant of 14, written as
; llvm.fshl with the same value %x for both data operands.
; Expected lowering per the checks below -
;   * AVX1 and AVX2 use a shift pair (left by 14, right by 50) combined with an
;     OR; AVX1 works on the two 128-bit halves separately.
;   * All AVX512 configurations use a single vprolq $14; the ones without the
;     VL feature operate on the widened zmm register.
;   * XOP configurations use vprotq $14 on each 128-bit half.
1394 define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
1395 ; AVX1-LABEL: splatconstant_funnnel_v4i64:
1397 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1
1398 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1399 ; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm3
1400 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1401 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0
1402 ; AVX1-NEXT: vpsllq $14, %xmm2, %xmm2
1403 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1404 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1407 ; AVX2-LABEL: splatconstant_funnnel_v4i64:
1409 ; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm1
1410 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0
1411 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1414 ; AVX512F-LABEL: splatconstant_funnnel_v4i64:
1416 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1417 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
1418 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1419 ; AVX512F-NEXT: retq
1421 ; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
1422 ; AVX512VL: # %bb.0:
1423 ; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
1424 ; AVX512VL-NEXT: retq
1426 ; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
1427 ; AVX512BW: # %bb.0:
1428 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1429 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
1430 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1431 ; AVX512BW-NEXT: retq
1433 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
1434 ; AVX512VLBW: # %bb.0:
1435 ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
1436 ; AVX512VLBW-NEXT: retq
1438 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
1439 ; AVX512VBMI2: # %bb.0:
1440 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1441 ; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
1442 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1443 ; AVX512VBMI2-NEXT: retq
1445 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
1446 ; AVX512VLVBMI2: # %bb.0:
1447 ; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0
1448 ; AVX512VLVBMI2-NEXT: retq
1450 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
1452 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
1453 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1454 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
1455 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1456 ; XOPAVX1-NEXT: retq
1458 ; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
1460 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
1461 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1462 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
1463 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1464 ; XOPAVX2-NEXT: retq
1465 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
; Rotate-left of every i32 lane by a splat constant of 4, written as
; llvm.fshl with the same value %x for both data operands.
; Expected lowering per the checks below -
;   * AVX1 and AVX2 use a shift pair (left by 4, right by 28) combined with an
;     OR; AVX1 works on the two 128-bit halves separately.
;   * All AVX512 configurations use a single vprold $4; the ones without the
;     VL feature operate on the widened zmm register.
;   * XOP configurations use vprotd $4 on each 128-bit half.
1469 define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
1470 ; AVX1-LABEL: splatconstant_funnnel_v8i32:
1472 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1473 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1474 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1475 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1476 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1477 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1478 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1479 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1482 ; AVX2-LABEL: splatconstant_funnnel_v8i32:
1484 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1485 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1486 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1489 ; AVX512F-LABEL: splatconstant_funnnel_v8i32:
1491 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1492 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1493 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1494 ; AVX512F-NEXT: retq
1496 ; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
1497 ; AVX512VL: # %bb.0:
1498 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1499 ; AVX512VL-NEXT: retq
1501 ; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
1502 ; AVX512BW: # %bb.0:
1503 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1504 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1505 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1506 ; AVX512BW-NEXT: retq
1508 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
1509 ; AVX512VLBW: # %bb.0:
1510 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1511 ; AVX512VLBW-NEXT: retq
1513 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
1514 ; AVX512VBMI2: # %bb.0:
1515 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1516 ; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
1517 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1518 ; AVX512VBMI2-NEXT: retq
1520 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
1521 ; AVX512VLVBMI2: # %bb.0:
1522 ; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
1523 ; AVX512VLVBMI2-NEXT: retq
1525 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
1527 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1528 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1529 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1530 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1531 ; XOPAVX1-NEXT: retq
1533 ; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
1535 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1536 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1537 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1538 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1539 ; XOPAVX2-NEXT: retq
1540 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
; Rotate-left of every i16 lane by a splat constant of 7, written as
; llvm.fshl with the same value %x for both data operands.
; Expected lowering per the checks below -
;   * AVX1, AVX2 and the AVX512 configurations without VBMI2 use a shift pair
;     (left by 7, right by 9) combined with an OR; AVX1 works on the two
;     128-bit halves separately.
;   * VBMI2 configurations use a single vpshldw $7 with the source repeated as
;     both inputs; without the VL feature it operates on the widened zmm
;     register.
;   * XOP configurations use vprotw $7 on each 128-bit half.
1544 define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
1545 ; AVX1-LABEL: splatconstant_funnnel_v16i16:
1547 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1548 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
1549 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
1550 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1551 ; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
1552 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
1553 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1554 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1557 ; AVX2-LABEL: splatconstant_funnnel_v16i16:
1559 ; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
1560 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
1561 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1564 ; AVX512F-LABEL: splatconstant_funnnel_v16i16:
1566 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm1
1567 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
1568 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1569 ; AVX512F-NEXT: retq
1571 ; AVX512VL-LABEL: splatconstant_funnnel_v16i16:
1572 ; AVX512VL: # %bb.0:
1573 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm1
1574 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
1575 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
1576 ; AVX512VL-NEXT: retq
1578 ; AVX512BW-LABEL: splatconstant_funnnel_v16i16:
1579 ; AVX512BW: # %bb.0:
1580 ; AVX512BW-NEXT: vpsrlw $9, %ymm0, %ymm1
1581 ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
1582 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1583 ; AVX512BW-NEXT: retq
1585 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16:
1586 ; AVX512VLBW: # %bb.0:
1587 ; AVX512VLBW-NEXT: vpsrlw $9, %ymm0, %ymm1
1588 ; AVX512VLBW-NEXT: vpsllw $7, %ymm0, %ymm0
1589 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1590 ; AVX512VLBW-NEXT: retq
1592 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16:
1593 ; AVX512VBMI2: # %bb.0:
1594 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1595 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
1596 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1597 ; AVX512VBMI2-NEXT: retq
1599 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
1600 ; AVX512VLVBMI2: # %bb.0:
1601 ; AVX512VLVBMI2-NEXT: vpshldw $7, %ymm0, %ymm0, %ymm0
1602 ; AVX512VLVBMI2-NEXT: retq
1604 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
1606 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
1607 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1608 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
1609 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1610 ; XOPAVX1-NEXT: retq
1612 ; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
1614 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
1615 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1616 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
1617 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1618 ; XOPAVX2-NEXT: retq
1619 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
; Rotate-left of every i8 lane by a splat constant of 4, written as
; llvm.fshl with the same value %x for both data operands.
; Expected lowering per the checks below -
;   * AVX1 and AVX2 shift 16-bit words left and right by 4 and mask each result
;     before the OR (AVX1 uses a 240 byte mask with vpand/vpandn per 128-bit
;     half; AVX2 masks with two constant-pool operands).
;   * All AVX512 configurations merge the two word shifts with a single
;     vpternlogd $216 against a broadcast constant; the ones without the VL
;     feature use the zmm form.
;   * XOP configurations use vprotb $4 on each 128-bit half.
1623 define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
1624 ; AVX1-LABEL: splatconstant_funnnel_v32i8:
1626 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1627 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1628 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1629 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1630 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1631 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1632 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1633 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1634 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1635 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1636 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1637 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1638 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1641 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
1643 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1644 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1645 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1646 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1647 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1650 ; AVX512F-LABEL: splatconstant_funnnel_v32i8:
1652 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1653 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1654 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
1655 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1656 ; AVX512F-NEXT: retq
1658 ; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
1659 ; AVX512VL: # %bb.0:
1660 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1661 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
1662 ; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
1663 ; AVX512VL-NEXT: retq
1665 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
1666 ; AVX512BW: # %bb.0:
1667 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
1668 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
1669 ; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
1670 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1671 ; AVX512BW-NEXT: retq
1673 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
1674 ; AVX512VLBW: # %bb.0:
1675 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1676 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1677 ; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
1678 ; AVX512VLBW-NEXT: retq
1680 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
1681 ; AVX512VBMI2: # %bb.0:
1682 ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
1683 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
1684 ; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
1685 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1686 ; AVX512VBMI2-NEXT: retq
1688 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
1689 ; AVX512VLVBMI2: # %bb.0:
1690 ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
1691 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
1692 ; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
1693 ; AVX512VLVBMI2-NEXT: retq
1695 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
1697 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1698 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1699 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1700 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1701 ; XOPAVX1-NEXT: retq
1703 ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
1705 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1706 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1707 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1708 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1709 ; XOPAVX2-NEXT: retq
1710 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)