; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)

;
; Variable Shifts
;

define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
21 ; AVX1-LABEL: var_funnnel_v4i64:
23 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
24 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm3
25 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
26 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm5
27 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
28 ; AVX1-NEXT: vpsllq %xmm4, %xmm2, %xmm4
29 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
30 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm5
31 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
32 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
33 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
34 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
35 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
36 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
37 ; AVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
38 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63]
39 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
40 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm7
41 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
42 ; AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm2
43 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
44 ; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
45 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
46 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm4
47 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
48 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
49 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
50 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
51 ; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0
54 ; AVX2-LABEL: var_funnnel_v4i64:
56 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
57 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
58 ; AVX2-NEXT: vpsllvq %ymm3, %ymm0, %ymm3
59 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
60 ; AVX2-NEXT: vpsubq %ymm1, %ymm4, %ymm1
61 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
62 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
63 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
66 ; AVX512F-LABEL: var_funnnel_v4i64:
68 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
69 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
70 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
71 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
74 ; AVX512VL-LABEL: var_funnnel_v4i64:
76 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
79 ; AVX512BW-LABEL: var_funnnel_v4i64:
81 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
82 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
83 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
84 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
87 ; AVX512VLBW-LABEL: var_funnnel_v4i64:
88 ; AVX512VLBW: # %bb.0:
89 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
90 ; AVX512VLBW-NEXT: retq
92 ; XOPAVX1-LABEL: var_funnnel_v4i64:
94 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
95 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
96 ; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
97 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
98 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
101 ; XOPAVX2-LABEL: var_funnnel_v4i64:
103 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
104 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
105 ; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
106 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
107 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
  ret <4 x i64> %res
}

define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
114 ; AVX1-LABEL: var_funnnel_v8i32:
116 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
117 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
118 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
119 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
120 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
121 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
122 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
123 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
124 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
125 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
126 ; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
127 ; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2
128 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
129 ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
130 ; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
131 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
132 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
133 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
134 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
135 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
136 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
137 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
138 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
139 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
140 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
141 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
142 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
143 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
144 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
145 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
146 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
149 ; AVX2-LABEL: var_funnnel_v8i32:
151 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
152 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
153 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2
154 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
155 ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
156 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
157 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
160 ; AVX512F-LABEL: var_funnnel_v8i32:
162 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
163 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
164 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
165 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
168 ; AVX512VL-LABEL: var_funnnel_v8i32:
170 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
171 ; AVX512VL-NEXT: retq
173 ; AVX512BW-LABEL: var_funnnel_v8i32:
175 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
176 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
177 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
178 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
179 ; AVX512BW-NEXT: retq
181 ; AVX512VLBW-LABEL: var_funnnel_v8i32:
182 ; AVX512VLBW: # %bb.0:
183 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
184 ; AVX512VLBW-NEXT: retq
186 ; XOPAVX1-LABEL: var_funnnel_v8i32:
188 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
189 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
190 ; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
191 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
192 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
195 ; XOPAVX2-LABEL: var_funnnel_v8i32:
197 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
198 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
199 ; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
200 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
201 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
  ret <8 x i32> %res
}

define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
208 ; AVX1-LABEL: var_funnnel_v16i16:
210 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
211 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
212 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
213 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
214 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
215 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
216 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
217 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
218 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
219 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
220 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
221 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
222 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
223 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
224 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
225 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
226 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
227 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
228 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
229 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
230 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
231 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
232 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
233 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
234 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
235 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
236 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
237 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
238 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
239 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
240 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
243 ; AVX2-LABEL: var_funnnel_v16i16:
245 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
246 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
247 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
248 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
249 ; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4
250 ; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
251 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
252 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
253 ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5
254 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5
255 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4
256 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
257 ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1
258 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
259 ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3
260 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
261 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
262 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
263 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
264 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
265 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
268 ; AVX512F-LABEL: var_funnnel_v16i16:
270 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
271 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
272 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
273 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
274 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
275 ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1
276 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
277 ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
278 ; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
279 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
282 ; AVX512VL-LABEL: var_funnnel_v16i16:
284 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
285 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
286 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
287 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
288 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
289 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1
290 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
291 ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
292 ; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0
293 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
294 ; AVX512VL-NEXT: retq
296 ; AVX512BW-LABEL: var_funnnel_v16i16:
298 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
299 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
300 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
301 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
302 ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
303 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
304 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
305 ; AVX512BW-NEXT: retq
307 ; AVX512VLBW-LABEL: var_funnnel_v16i16:
308 ; AVX512VLBW: # %bb.0:
309 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
310 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
311 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
312 ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1
313 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
314 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
315 ; AVX512VLBW-NEXT: retq
317 ; XOPAVX1-LABEL: var_funnnel_v16i16:
319 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
320 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
321 ; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
322 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
323 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
326 ; XOPAVX2-LABEL: var_funnnel_v16i16:
328 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
329 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
330 ; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
331 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
332 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
  ret <16 x i16> %res
}

define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
339 ; AVX1-LABEL: var_funnnel_v32i8:
341 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
342 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
343 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
344 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
345 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5
346 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm5
347 ; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
348 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
349 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
350 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
351 ; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3
352 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
353 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
354 ; AVX1-NEXT: vpsllw $2, %xmm2, %xmm7
355 ; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7
356 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
357 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
358 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
359 ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3
360 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
361 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
362 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7
363 ; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3
364 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
365 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
366 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
367 ; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
368 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm5
369 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
370 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
371 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
372 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
373 ; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3
374 ; AVX1-NEXT: vpandn %xmm3, %xmm6, %xmm3
375 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4
376 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
377 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
378 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
379 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
380 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3
381 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
382 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4
383 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
384 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
385 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
386 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
389 ; AVX2-LABEL: var_funnnel_v32i8:
391 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
392 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
393 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3
394 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
395 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
396 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
397 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
398 ; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2
399 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
400 ; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3
401 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
402 ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
403 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
404 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
405 ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
406 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3
407 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
408 ; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
409 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
410 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
413 ; AVX512F-LABEL: var_funnnel_v32i8:
415 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
416 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
417 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
418 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
419 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
420 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
421 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
422 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2
423 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
424 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
425 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
426 ; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
427 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
428 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
429 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2
430 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
431 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
432 ; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
433 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
434 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
437 ; AVX512VL-LABEL: var_funnnel_v32i8:
439 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
440 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
441 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
442 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
443 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
444 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
445 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
446 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
447 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
448 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
449 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
450 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
451 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
452 ; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
453 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
454 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
455 ; AVX512VL-NEXT: retq
457 ; AVX512BW-LABEL: var_funnnel_v32i8:
459 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
460 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm3
461 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
462 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
463 ; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
464 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
465 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
466 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
467 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
468 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
469 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
470 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
471 ; AVX512BW-NEXT: retq
473 ; AVX512VLBW-LABEL: var_funnnel_v32i8:
474 ; AVX512VLBW: # %bb.0:
475 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
476 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm3
477 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
478 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
479 ; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
480 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
481 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
482 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
483 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
484 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
485 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
486 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
487 ; AVX512VLBW-NEXT: retq
489 ; XOPAVX1-LABEL: var_funnnel_v32i8:
491 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
492 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
493 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
494 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
495 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
498 ; XOPAVX2-LABEL: var_funnnel_v32i8:
500 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
501 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
502 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
503 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
504 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
  ret <32 x i8> %res
}

;
; Uniform Variable Shifts
;

define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
515 ; AVX1-LABEL: splatvar_funnnel_v4i64:
517 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
518 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
519 ; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
520 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
521 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
522 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
523 ; AVX1-NEXT: vpsrlq %xmm2, %xmm4, %xmm5
524 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
525 ; AVX1-NEXT: vpsrlq %xmm6, %xmm4, %xmm7
526 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
527 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm2
528 ; AVX1-NEXT: vpsrlq %xmm6, %xmm0, %xmm6
529 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
530 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
531 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
532 ; AVX1-NEXT: vpsllq %xmm1, %xmm4, %xmm3
533 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
534 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
535 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
538 ; AVX2-LABEL: splatvar_funnnel_v4i64:
540 ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
541 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
542 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
543 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3
544 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
545 ; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
546 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
547 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
548 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
551 ; AVX512F-LABEL: splatvar_funnnel_v4i64:
553 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
554 ; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1
555 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
556 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
559 ; AVX512VL-LABEL: splatvar_funnnel_v4i64:
561 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
562 ; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
563 ; AVX512VL-NEXT: retq
565 ; AVX512BW-LABEL: splatvar_funnnel_v4i64:
567 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
568 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1
569 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
570 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
571 ; AVX512BW-NEXT: retq
573 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
574 ; AVX512VLBW: # %bb.0:
575 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1
576 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
577 ; AVX512VLBW-NEXT: retq
579 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
581 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
582 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
583 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2
584 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
585 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
588 ; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
590 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1
591 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
592 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
593 ; XOPAVX2-NEXT: vprotq %xmm3, %xmm2, %xmm2
594 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
595 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
  ret <4 x i64> %res
}

define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
603 ; AVX1-LABEL: splatvar_funnnel_v8i32:
605 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
606 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
607 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
608 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
609 ; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm4
610 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
611 ; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1
612 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
613 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
614 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
615 ; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3
616 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
617 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
618 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
621 ; AVX2-LABEL: splatvar_funnnel_v8i32:
623 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
624 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
625 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
626 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
627 ; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2
628 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
629 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
630 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
631 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
632 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
635 ; AVX512F-LABEL: splatvar_funnnel_v8i32:
637 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
638 ; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1
639 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
640 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
643 ; AVX512VL-LABEL: splatvar_funnnel_v8i32:
645 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1
646 ; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
647 ; AVX512VL-NEXT: retq
649 ; AVX512BW-LABEL: splatvar_funnnel_v8i32:
651 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
652 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1
653 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
654 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
655 ; AVX512BW-NEXT: retq
657 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
658 ; AVX512VLBW: # %bb.0:
659 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1
660 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
661 ; AVX512VLBW-NEXT: retq
663 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
665 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
666 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
667 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2
668 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
669 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
672 ; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
674 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1
675 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
676 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
677 ; XOPAVX2-NEXT: vprotd %xmm3, %xmm2, %xmm2
678 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
679 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
  ret <8 x i32> %res
}

define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
687 ; AVX1-LABEL: splatvar_funnnel_v16i16:
689 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
690 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
691 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
692 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
693 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
694 ; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4
695 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
696 ; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1
697 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
698 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
699 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
700 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
701 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
702 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
703 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
706 ; AVX2-LABEL: splatvar_funnnel_v16i16:
708 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
709 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
710 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
711 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2
712 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
713 ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
714 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
715 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
716 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
719 ; AVX512-LABEL: splatvar_funnnel_v16i16:
721 ; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1
722 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
723 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
724 ; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2
725 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
726 ; AVX512-NEXT: vpsubw %xmm1, %xmm3, %xmm1
727 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
728 ; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
729 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
732 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
734 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
735 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
736 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
737 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2
738 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
739 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
742 ; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
744 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1
745 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
746 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
747 ; XOPAVX2-NEXT: vprotw %xmm3, %xmm2, %xmm2
748 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
749 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
  ret <16 x i16> %res
}

define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
757 ; AVX1-LABEL: splatvar_funnnel_v32i8:
759 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
760 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
761 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
762 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
763 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
764 ; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5
765 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
766 ; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7
767 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
768 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
769 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
770 ; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1
771 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
772 ; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4
773 ; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6
774 ; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
775 ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
776 ; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
777 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3
778 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
779 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
780 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
781 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
782 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
785 ; AVX2-LABEL: splatvar_funnnel_v32i8:
787 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
788 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
789 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
790 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
791 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
792 ; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
793 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
794 ; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
795 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
796 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
797 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
798 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
799 ; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
800 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
801 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
802 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
803 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
806 ; AVX512F-LABEL: splatvar_funnnel_v32i8:
808 ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
809 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
810 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
811 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
812 ; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
813 ; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
814 ; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
815 ; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
816 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
817 ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
818 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
819 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
820 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
821 ; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
822 ; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
823 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
824 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
827 ; AVX512VL-LABEL: splatvar_funnnel_v32i8:
829 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
830 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
831 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
832 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
833 ; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
834 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
835 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
836 ; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
837 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
838 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
839 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
840 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
841 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
842 ; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
843 ; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
844 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
845 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
846 ; AVX512VL-NEXT: retq
848 ; AVX512BW-LABEL: splatvar_funnnel_v32i8:
850 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
851 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
852 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm3
853 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
854 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
855 ; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
856 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
857 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
858 ; AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
859 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
860 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
861 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
862 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
863 ; AVX512BW-NEXT: retq
865 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
866 ; AVX512VLBW: # %bb.0:
867 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1
868 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
869 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm3
870 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
871 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
872 ; AVX512VLBW-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
873 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
874 ; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm4, %ymm1
875 ; AVX512VLBW-NEXT: vpand %ymm2, %ymm1, %ymm1
876 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
877 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
878 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
879 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
880 ; AVX512VLBW-NEXT: retq
882 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
884 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
885 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
886 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
887 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2
888 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
889 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
892 ; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
894 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
895 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
896 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
897 ; XOPAVX2-NEXT: vprotb %xmm3, %xmm2, %xmm2
898 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
899 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
  ret <32 x i8> %res
}

;
; Constant Shifts
;

define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
911 ; AVX1-LABEL: constant_funnnel_v4i64:
913 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
914 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm2
915 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm3
916 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
917 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3
918 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm4
919 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
920 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
921 ; AVX1-NEXT: vpsllq $60, %xmm1, %xmm3
922 ; AVX1-NEXT: vpsllq $50, %xmm1, %xmm1
923 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
924 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3
925 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
926 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
927 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
928 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
931 ; AVX2-LABEL: constant_funnnel_v4i64:
933 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm1
934 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
935 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
938 ; AVX512F-LABEL: constant_funnnel_v4i64:
940 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
941 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
942 ; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0
943 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
946 ; AVX512VL-LABEL: constant_funnnel_v4i64:
948 ; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
949 ; AVX512VL-NEXT: retq
951 ; AVX512BW-LABEL: constant_funnnel_v4i64:
953 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
954 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
955 ; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
956 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
957 ; AVX512BW-NEXT: retq
959 ; AVX512VLBW-LABEL: constant_funnnel_v4i64:
960 ; AVX512VLBW: # %bb.0:
961 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
962 ; AVX512VLBW-NEXT: retq
964 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
966 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
967 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
968 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
969 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
972 ; XOPAVX2-LABEL: constant_funnnel_v4i64:
974 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
975 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
976 ; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
977 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
  ret <4 x i64> %res
}

define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
984 ; AVX1-LABEL: constant_funnnel_v8i32:
986 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
987 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
988 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
989 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
990 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
991 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
992 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
993 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
994 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
995 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
996 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
997 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
998 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
999 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1000 ; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
1001 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
1002 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1003 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1004 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
1005 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1006 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1007 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1010 ; AVX2-LABEL: constant_funnnel_v8i32:
1012 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
1013 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
1014 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1017 ; AVX512F-LABEL: constant_funnnel_v8i32:
1019 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1020 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1021 ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1022 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1023 ; AVX512F-NEXT: retq
1025 ; AVX512VL-LABEL: constant_funnnel_v8i32:
1026 ; AVX512VL: # %bb.0:
1027 ; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1028 ; AVX512VL-NEXT: retq
1030 ; AVX512BW-LABEL: constant_funnnel_v8i32:
1031 ; AVX512BW: # %bb.0:
1032 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1033 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1034 ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
1035 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1036 ; AVX512BW-NEXT: retq
1038 ; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1039 ; AVX512VLBW: # %bb.0:
1040 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
1041 ; AVX512VLBW-NEXT: retq
1043 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
1045 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1046 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1047 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1048 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1049 ; XOPAVX1-NEXT: retq
1051 ; XOPAVX2-LABEL: constant_funnnel_v8i32:
1053 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
1054 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1055 ; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
1056 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1057 ; XOPAVX2-NEXT: retq
  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
  ret <8 x i32> %res
}

define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
1063 ; AVX1-LABEL: constant_funnnel_v16i16:
1065 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1066 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1067 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
1068 ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
1069 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1070 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1071 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3
1072 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
1073 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1074 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1077 ; AVX2-LABEL: constant_funnnel_v16i16:
1079 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1080 ; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1081 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1082 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
1085 ; AVX512F-LABEL: constant_funnnel_v16i16:
1087 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1088 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1089 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1090 ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
1091 ; AVX512F-NEXT: retq
1093 ; AVX512VL-LABEL: constant_funnnel_v16i16:
1094 ; AVX512VL: # %bb.0:
1095 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1096 ; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
1097 ; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1098 ; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
1099 ; AVX512VL-NEXT: retq
1101 ; AVX512BW-LABEL: constant_funnnel_v16i16:
1102 ; AVX512BW: # %bb.0:
1103 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1104 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1105 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
1106 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1107 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1108 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1109 ; AVX512BW-NEXT: retq
1111 ; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1112 ; AVX512VLBW: # %bb.0:
1113 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
1114 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1115 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1116 ; AVX512VLBW-NEXT: retq
1118 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
1120 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1121 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1122 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1123 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1124 ; XOPAVX1-NEXT: retq
1126 ; XOPAVX2-LABEL: constant_funnnel_v16i16:
1128 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
1129 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1130 ; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
1131 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1132 ; XOPAVX2-NEXT: retq
  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
  ret <16 x i16> %res
}

define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
1138 ; AVX1-LABEL: constant_funnnel_v32i8:
1140 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1141 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
1142 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1143 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
1144 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1145 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1146 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1147 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
1148 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7
1149 ; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
1150 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
1151 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1152 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
1153 ; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
1154 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1155 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1156 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1157 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
1158 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
1159 ; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1
1160 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
1161 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1162 ; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3
1163 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
1164 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1165 ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6
1166 ; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
1167 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
1168 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1169 ; AVX1-NEXT: vpmullw %xmm7, %xmm0, %xmm0
1170 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1171 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
1172 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1173 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1174 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1175 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1178 ; AVX2-LABEL: constant_funnnel_v32i8:
1180 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
1181 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1182 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1183 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
1184 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1185 ; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
1186 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1187 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1188 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1189 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1190 ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1191 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1192 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1193 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1194 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1195 ; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
1196 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1197 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1198 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
1199 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1200 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
1201 ; AVX2-NEXT: retq
1203 ; AVX512F-LABEL: constant_funnnel_v32i8:
1205 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
1206 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1207 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1208 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
1209 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1210 ; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
1211 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1212 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1213 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1214 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1215 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1216 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1217 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1218 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1219 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
1220 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
1221 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1222 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1223 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
1224 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1225 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
1226 ; AVX512F-NEXT: retq
1228 ; AVX512VL-LABEL: constant_funnnel_v32i8:
1229 ; AVX512VL: # %bb.0:
1230 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1231 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1232 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1233 ; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
1234 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1235 ; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
1236 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
1237 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1238 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1239 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1240 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1241 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1242 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1243 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1244 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
1245 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
1246 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1247 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1248 ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
1249 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
1250 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1251 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
1252 ; AVX512VL-NEXT: retq
1254 ; AVX512BW-LABEL: constant_funnnel_v32i8:
1255 ; AVX512BW: # %bb.0:
1256 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1257 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1258 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1259 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
1260 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1261 ; AVX512BW-NEXT: retq
1263 ; AVX512VLBW-LABEL: constant_funnnel_v32i8:
1264 ; AVX512VLBW: # %bb.0:
1265 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1266 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1267 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1268 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
1269 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
1270 ; AVX512VLBW-NEXT: retq
1272 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
1274 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1275 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1276 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
1277 ; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0
1278 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1279 ; XOPAVX1-NEXT: retq
1281 ; XOPAVX2-LABEL: constant_funnnel_v32i8:
1283 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1284 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1285 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
1286 ; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0
1287 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1288 ; XOPAVX2-NEXT: retq
1289 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1290 ret <32 x i8> %res
1291 }
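; The byte variant rotates each lane by the mirrored pattern 0,1,...,8,7,...,1. With no
; per-byte shift available, the AVX1/AVX2/AVX512F sequences above roughly synthesize the
; left shift as a pmullw by 2^c (keeping the low byte of each product) and the right shift
; as a pmullw followed by psrlw $8 (keeping the high byte), then recombine the halves with
; por. XOP again folds the whole rotate into vprotb with the constant vector of amounts.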
1294 ; Uniform Constant Shifts
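; The splat-amount tests below use the same rotate count in every lane, so targets with a
; native rotate can use an immediate form (vprolq/vprold on AVX512, vprot* on XOP), while
; the remaining targets fall back to an immediate shift-left/shift-right/or sequence.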
1297 define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
1298 ; AVX1-LABEL: splatconstant_funnnel_v4i64:
1300 ; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm1
1301 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1302 ; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm3
1303 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
1304 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0
1305 ; AVX1-NEXT: vpsllq $14, %xmm2, %xmm2
1306 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1307 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
1308 ; AVX1-NEXT: retq
1310 ; AVX2-LABEL: splatconstant_funnnel_v4i64:
1312 ; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm1
1313 ; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0
1314 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1315 ; AVX2-NEXT: retq
1317 ; AVX512F-LABEL: splatconstant_funnnel_v4i64:
1319 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1320 ; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
1321 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1322 ; AVX512F-NEXT: retq
1324 ; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
1325 ; AVX512VL: # %bb.0:
1326 ; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
1327 ; AVX512VL-NEXT: retq
1329 ; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
1330 ; AVX512BW: # %bb.0:
1331 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1332 ; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
1333 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1334 ; AVX512BW-NEXT: retq
1336 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
1337 ; AVX512VLBW: # %bb.0:
1338 ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
1339 ; AVX512VLBW-NEXT: retq
1341 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
1343 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
1344 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1345 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
1346 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1347 ; XOPAVX1-NEXT: retq
1349 ; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
1351 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
1352 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1353 ; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
1354 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1355 ; XOPAVX2-NEXT: retq
1356 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
1357 ret <4 x i64> %res
1358 }
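; Rotate left by 14 on i64 lanes: the shift-based expansions pair vpsllq $14 with
; vpsrlq $50 (64 - 14 = 50) and or the results, while AVX512 uses a single vprolq $14.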
1360 define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
1361 ; AVX1-LABEL: splatconstant_funnnel_v8i32:
1363 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1364 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
1365 ; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
1366 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1367 ; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
1368 ; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
1369 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1370 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1371 ; AVX1-NEXT: retq
1373 ; AVX2-LABEL: splatconstant_funnnel_v8i32:
1375 ; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
1376 ; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
1377 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1378 ; AVX2-NEXT: retq
1380 ; AVX512F-LABEL: splatconstant_funnnel_v8i32:
1382 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1383 ; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
1384 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1385 ; AVX512F-NEXT: retq
1387 ; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
1388 ; AVX512VL: # %bb.0:
1389 ; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
1390 ; AVX512VL-NEXT: retq
1392 ; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
1393 ; AVX512BW: # %bb.0:
1394 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1395 ; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
1396 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1397 ; AVX512BW-NEXT: retq
1399 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
1400 ; AVX512VLBW: # %bb.0:
1401 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
1402 ; AVX512VLBW-NEXT: retq
1404 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
1406 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
1407 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1408 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
1409 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1410 ; XOPAVX1-NEXT: retq
1412 ; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
1414 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
1415 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1416 ; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
1417 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1418 ; XOPAVX2-NEXT: retq
1419 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
1420 ret <8 x i32> %res
1421 }
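; Same shape for i32 lanes with an amount of 4: vpslld $4 pairs with vpsrld $28
; (32 - 4 = 28), and AVX512 targets use vprold $4 directly.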
1423 define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
1424 ; AVX1-LABEL: splatconstant_funnnel_v16i16:
1426 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1427 ; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
1428 ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
1429 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1430 ; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
1431 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
1432 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1433 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1434 ; AVX1-NEXT: retq
1436 ; AVX2-LABEL: splatconstant_funnnel_v16i16:
1438 ; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
1439 ; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
1440 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1441 ; AVX2-NEXT: retq
1443 ; AVX512-LABEL: splatconstant_funnnel_v16i16:
1445 ; AVX512-NEXT: vpsrlw $9, %ymm0, %ymm1
1446 ; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
1447 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
1448 ; AVX512-NEXT: retq
1450 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
1452 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
1453 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1454 ; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
1455 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1456 ; XOPAVX1-NEXT: retq
1458 ; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
1460 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
1461 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1462 ; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
1463 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1464 ; XOPAVX2-NEXT: retq
1465 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1466 ret <16 x i16> %res
1467 }
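; For i16 lanes there is no AVX512 rotate instruction, so even the AVX512 prefixes share
; the vpsllw $7 / vpsrlw $9 (16 - 7 = 9) / vpor expansion; only XOP has vprotw $7.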
1469 define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
1470 ; AVX1-LABEL: splatconstant_funnnel_v32i8:
1472 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1473 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
1474 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1475 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1476 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
1477 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1478 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
1479 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
1480 ; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
1481 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
1482 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1483 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
1484 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1485 ; AVX1-NEXT: retq
1487 ; AVX2-LABEL: splatconstant_funnnel_v32i8:
1489 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
1490 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1491 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
1492 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1493 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
1494 ; AVX2-NEXT: retq
1496 ; AVX512F-LABEL: splatconstant_funnnel_v32i8:
1498 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
1499 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1500 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
1501 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1502 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
1503 ; AVX512F-NEXT: retq
1505 ; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
1506 ; AVX512VL: # %bb.0:
1507 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
1508 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
1509 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
1510 ; AVX512VL-NEXT: retq
1512 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
1513 ; AVX512BW: # %bb.0:
1514 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm1
1515 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
1516 ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm0
1517 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1518 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1519 ; AVX512BW-NEXT: retq
1521 ; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
1522 ; AVX512VLBW: # %bb.0:
1523 ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
1524 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
1525 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
1526 ; AVX512VLBW-NEXT: retq
1528 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
1530 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
1531 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1532 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
1533 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1534 ; XOPAVX1-NEXT: retq
1536 ; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
1538 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
1539 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1540 ; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
1541 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1542 ; XOPAVX2-NEXT: retq
1543 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
1544 ret <32 x i8> %res
1545 }
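; Rotating bytes left by 4 swaps the two nibbles of each byte. A minimal scalar sketch:
;   rotl8(x, 4) == ((x << 4) & 0xF0) | ((x >> 4) & 0x0F)
; which is why the expansions above combine vpsllw $4 / vpsrlw $4 with 0xF0-based masks;
; AVX512VL/AVX512VLBW fold the mask-and-or into one vpternlogq, and XOP uses vprotb $4.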