1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
15 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
18 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
20 declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
21 declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
; var_funnnel_v2i64 - variable funnel shift left on <2 x i64>.
; The IR under test is a single call to @llvm.fshl.v2i64 with operands
; (%x, %y, %amt). Everything between the define and that call is an
; autogenerated expectation block, one per RUN-line check prefix; regenerate
; it with utils/update_llc_test_checks.py rather than editing it by hand.
; In the expectations below, the two VBMI2 configurations use vpshldvq, while
; the remaining ones mask the amount with 63 and OR a left shift with a right
; shift. The X86-SSE2 block belongs to the single i686 run.
27 define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
28 ; SSE2-LABEL: var_funnnel_v2i64:
30 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
31 ; SSE2-NEXT: movdqa %xmm2, %xmm4
32 ; SSE2-NEXT: pandn %xmm3, %xmm4
33 ; SSE2-NEXT: psrlq $1, %xmm1
34 ; SSE2-NEXT: movdqa %xmm1, %xmm5
35 ; SSE2-NEXT: psrlq %xmm4, %xmm5
36 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
37 ; SSE2-NEXT: psrlq %xmm4, %xmm1
38 ; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
39 ; SSE2-NEXT: pand %xmm3, %xmm2
40 ; SSE2-NEXT: movdqa %xmm0, %xmm1
41 ; SSE2-NEXT: psllq %xmm2, %xmm1
42 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
43 ; SSE2-NEXT: psllq %xmm2, %xmm0
44 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
45 ; SSE2-NEXT: orpd %xmm5, %xmm0
48 ; SSE41-LABEL: var_funnnel_v2i64:
50 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
51 ; SSE41-NEXT: movdqa %xmm2, %xmm4
52 ; SSE41-NEXT: pandn %xmm3, %xmm4
53 ; SSE41-NEXT: psrlq $1, %xmm1
54 ; SSE41-NEXT: movdqa %xmm1, %xmm5
55 ; SSE41-NEXT: psrlq %xmm4, %xmm5
56 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
57 ; SSE41-NEXT: psrlq %xmm4, %xmm1
58 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
59 ; SSE41-NEXT: pand %xmm3, %xmm2
60 ; SSE41-NEXT: movdqa %xmm0, %xmm1
61 ; SSE41-NEXT: psllq %xmm2, %xmm1
62 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
63 ; SSE41-NEXT: psllq %xmm2, %xmm0
64 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
65 ; SSE41-NEXT: por %xmm5, %xmm0
68 ; AVX1-LABEL: var_funnnel_v2i64:
70 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
71 ; AVX1-NEXT: # xmm3 = mem[0,0]
72 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
73 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
74 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
75 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
76 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
77 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
78 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
79 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
80 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
81 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
82 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
83 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
86 ; AVX2-LABEL: var_funnnel_v2i64:
88 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
89 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
90 ; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
91 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
92 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
93 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
94 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
97 ; AVX512F-LABEL: var_funnnel_v2i64:
99 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
100 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
101 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
102 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
103 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
104 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
105 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
108 ; AVX512VL-LABEL: var_funnnel_v2i64:
110 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
111 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
112 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
113 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
114 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
115 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
116 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
117 ; AVX512VL-NEXT: retq
119 ; AVX512BW-LABEL: var_funnnel_v2i64:
121 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
122 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
123 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
124 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
125 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
126 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
127 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
128 ; AVX512BW-NEXT: retq
130 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
131 ; AVX512VBMI2: # %bb.0:
132 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
133 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
134 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
135 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
136 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
137 ; AVX512VBMI2-NEXT: vzeroupper
138 ; AVX512VBMI2-NEXT: retq
140 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
141 ; AVX512VLBW: # %bb.0:
142 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
143 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
144 ; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1
145 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
146 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
147 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
148 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
149 ; AVX512VLBW-NEXT: retq
151 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
152 ; AVX512VLVBMI2: # %bb.0:
153 ; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0
154 ; AVX512VLVBMI2-NEXT: retq
156 ; XOPAVX1-LABEL: var_funnnel_v2i64:
158 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
159 ; XOPAVX1-NEXT: # xmm3 = mem[0,0]
160 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
161 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
162 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
163 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
164 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
165 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
166 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
167 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
170 ; XOPAVX2-LABEL: var_funnnel_v2i64:
172 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
173 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
174 ; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
175 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
176 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
177 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
178 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
181 ; X86-SSE2-LABEL: var_funnnel_v2i64:
183 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
184 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
185 ; X86-SSE2-NEXT: pandn %xmm4, %xmm5
186 ; X86-SSE2-NEXT: psrlq $1, %xmm1
187 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
188 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm3
189 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
190 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
191 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
192 ; X86-SSE2-NEXT: pand %xmm4, %xmm2
193 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
194 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1
195 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
196 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
197 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
198 ; X86-SSE2-NEXT: orpd %xmm3, %xmm0
199 ; X86-SSE2-NEXT: retl
; Sole instruction under test. %amt is a plain function argument, so every
; lane's shift amount is variable.
200 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; var_funnnel_v4i32 - variable funnel shift left on <4 x i32>.
; The IR under test is a single call to @llvm.fshl.v4i32 with operands
; (%x, %y, %amt). Everything between the define and that call is an
; autogenerated expectation block, one per RUN-line check prefix; regenerate
; it with utils/update_llc_test_checks.py rather than editing it by hand.
; In the expectations below, the two VBMI2 configurations use vpshldvd and the
; remaining ones mask the amount with 31. The X86-SSE2 block belongs to the
; single i686 run.
204 define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
205 ; SSE2-LABEL: var_funnnel_v4i32:
207 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
208 ; SSE2-NEXT: movdqa %xmm2, %xmm5
209 ; SSE2-NEXT: pandn %xmm4, %xmm5
210 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
211 ; SSE2-NEXT: psrld $1, %xmm1
212 ; SSE2-NEXT: movdqa %xmm1, %xmm6
213 ; SSE2-NEXT: psrld %xmm3, %xmm6
214 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
215 ; SSE2-NEXT: movdqa %xmm1, %xmm3
216 ; SSE2-NEXT: psrld %xmm7, %xmm3
217 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
218 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
219 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
220 ; SSE2-NEXT: movdqa %xmm1, %xmm7
221 ; SSE2-NEXT: psrld %xmm6, %xmm7
222 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
223 ; SSE2-NEXT: psrld %xmm5, %xmm1
224 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
225 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
226 ; SSE2-NEXT: pand %xmm4, %xmm2
227 ; SSE2-NEXT: pslld $23, %xmm2
228 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
229 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
230 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
231 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
232 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
233 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
234 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
235 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
236 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
237 ; SSE2-NEXT: por %xmm3, %xmm0
240 ; SSE41-LABEL: var_funnnel_v4i32:
242 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
243 ; SSE41-NEXT: movdqa %xmm2, %xmm4
244 ; SSE41-NEXT: pandn %xmm3, %xmm4
245 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
246 ; SSE41-NEXT: psrld $1, %xmm1
247 ; SSE41-NEXT: movdqa %xmm1, %xmm6
248 ; SSE41-NEXT: psrld %xmm5, %xmm6
249 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
250 ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
251 ; SSE41-NEXT: movdqa %xmm1, %xmm8
252 ; SSE41-NEXT: psrld %xmm7, %xmm8
253 ; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
254 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
255 ; SSE41-NEXT: movdqa %xmm1, %xmm6
256 ; SSE41-NEXT: psrld %xmm4, %xmm6
257 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
258 ; SSE41-NEXT: psrld %xmm4, %xmm1
259 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
260 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
261 ; SSE41-NEXT: pand %xmm3, %xmm2
262 ; SSE41-NEXT: pslld $23, %xmm2
263 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
264 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
265 ; SSE41-NEXT: pmulld %xmm1, %xmm0
266 ; SSE41-NEXT: por %xmm6, %xmm0
269 ; AVX1-LABEL: var_funnnel_v4i32:
271 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
272 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
273 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
274 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
275 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
276 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
277 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
278 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
279 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
280 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
281 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
282 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
283 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
284 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
285 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
286 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
287 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
288 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
289 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
290 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
291 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
294 ; AVX2-LABEL: var_funnnel_v4i32:
296 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
297 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
298 ; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
299 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
300 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
301 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
302 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
305 ; AVX512F-LABEL: var_funnnel_v4i32:
307 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
308 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
309 ; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
310 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
311 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
312 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
313 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
316 ; AVX512VL-LABEL: var_funnnel_v4i32:
318 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
319 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
320 ; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
321 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
322 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
323 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
324 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
325 ; AVX512VL-NEXT: retq
327 ; AVX512BW-LABEL: var_funnnel_v4i32:
329 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
330 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
331 ; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1
332 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
333 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
334 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
335 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
336 ; AVX512BW-NEXT: retq
338 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
339 ; AVX512VBMI2: # %bb.0:
340 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
341 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
342 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
343 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
344 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
345 ; AVX512VBMI2-NEXT: vzeroupper
346 ; AVX512VBMI2-NEXT: retq
348 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
349 ; AVX512VLBW: # %bb.0:
350 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
351 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
352 ; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1
353 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
354 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
355 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
356 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
357 ; AVX512VLBW-NEXT: retq
359 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
360 ; AVX512VLVBMI2: # %bb.0:
361 ; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
362 ; AVX512VLVBMI2-NEXT: retq
364 ; XOPAVX1-LABEL: var_funnnel_v4i32:
366 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
367 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
368 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
369 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
370 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
371 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
372 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
373 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
374 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
377 ; XOPAVX2-LABEL: var_funnnel_v4i32:
379 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
380 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
381 ; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
382 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
383 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
384 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
385 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
388 ; X86-SSE2-LABEL: var_funnnel_v4i32:
390 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
391 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
392 ; X86-SSE2-NEXT: pandn %xmm4, %xmm5
393 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
394 ; X86-SSE2-NEXT: psrld $1, %xmm1
395 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
396 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6
397 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
398 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
399 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3
400 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
401 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
402 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
403 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
404 ; X86-SSE2-NEXT: psrld %xmm6, %xmm7
405 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
406 ; X86-SSE2-NEXT: psrld %xmm5, %xmm1
407 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
408 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
409 ; X86-SSE2-NEXT: pand %xmm4, %xmm2
410 ; X86-SSE2-NEXT: pslld $23, %xmm2
411 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
412 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
413 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
414 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
415 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
416 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
417 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
418 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
419 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
420 ; X86-SSE2-NEXT: por %xmm3, %xmm0
421 ; X86-SSE2-NEXT: retl
; Sole instruction under test. %amt is a plain function argument, so every
; lane's shift amount is variable.
422 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; var_funnnel_v8i16 - variable funnel shift left on <8 x i16>.
; The IR under test is a single call to @llvm.fshl.v8i16 with operands
; (%x, %y, %amt). Everything between the define and that call is an
; autogenerated expectation block, one per RUN-line check prefix; regenerate
; it with utils/update_llc_test_checks.py rather than editing it by hand.
; In the expectations below, the two VBMI2 configurations use vpshldvw. The
; X86-SSE2 block belongs to the single i686 run.
426 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
427 ; SSE2-LABEL: var_funnnel_v8i16:
429 ; SSE2-NEXT: movdqa %xmm1, %xmm3
430 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
431 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
432 ; SSE2-NEXT: movdqa %xmm2, %xmm4
433 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
434 ; SSE2-NEXT: pslld $23, %xmm4
435 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
436 ; SSE2-NEXT: paddd %xmm5, %xmm4
437 ; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
438 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
439 ; SSE2-NEXT: pmuludq %xmm4, %xmm3
440 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
441 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
442 ; SSE2-NEXT: pmuludq %xmm6, %xmm4
443 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
444 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
445 ; SSE2-NEXT: psrad $16, %xmm3
446 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
447 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
448 ; SSE2-NEXT: pslld $23, %xmm2
449 ; SSE2-NEXT: paddd %xmm5, %xmm2
450 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
451 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
452 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
453 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
454 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
455 ; SSE2-NEXT: pmuludq %xmm4, %xmm1
456 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
457 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
458 ; SSE2-NEXT: psrad $16, %xmm0
459 ; SSE2-NEXT: packssdw %xmm3, %xmm0
462 ; SSE41-LABEL: var_funnnel_v8i16:
464 ; SSE41-NEXT: movdqa %xmm1, %xmm3
465 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
466 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
467 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
468 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
469 ; SSE41-NEXT: pslld $23, %xmm2
470 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
471 ; SSE41-NEXT: paddd %xmm5, %xmm2
472 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
473 ; SSE41-NEXT: pmulld %xmm3, %xmm2
474 ; SSE41-NEXT: psrld $16, %xmm2
475 ; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
476 ; SSE41-NEXT: pslld $23, %xmm4
477 ; SSE41-NEXT: paddd %xmm5, %xmm4
478 ; SSE41-NEXT: cvttps2dq %xmm4, %xmm0
479 ; SSE41-NEXT: pmulld %xmm1, %xmm0
480 ; SSE41-NEXT: psrld $16, %xmm0
481 ; SSE41-NEXT: packusdw %xmm2, %xmm0
484 ; AVX1-LABEL: var_funnnel_v8i16:
486 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
487 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
488 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
489 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
490 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
491 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
492 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
493 ; AVX1-NEXT: vpmulld %xmm4, %xmm3, %xmm3
494 ; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3
495 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
496 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
497 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
498 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
499 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
500 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
501 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
502 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
505 ; AVX2-LABEL: var_funnnel_v8i16:
507 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
508 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
509 ; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
510 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
511 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
512 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
513 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
514 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
515 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
516 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
517 ; AVX2-NEXT: vzeroupper
520 ; AVX512F-LABEL: var_funnnel_v8i16:
522 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
523 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
524 ; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0
525 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
526 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
527 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
528 ; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
529 ; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
530 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
531 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
532 ; AVX512F-NEXT: vzeroupper
535 ; AVX512VL-LABEL: var_funnnel_v8i16:
537 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
538 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
539 ; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0
540 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
541 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
542 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
543 ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
544 ; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
545 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
546 ; AVX512VL-NEXT: vzeroupper
547 ; AVX512VL-NEXT: retq
549 ; AVX512BW-LABEL: var_funnnel_v8i16:
551 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
552 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
553 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
554 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
555 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
556 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
557 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
558 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
559 ; AVX512BW-NEXT: vzeroupper
560 ; AVX512BW-NEXT: retq
562 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
563 ; AVX512VBMI2: # %bb.0:
564 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
565 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
566 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
567 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
568 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
569 ; AVX512VBMI2-NEXT: vzeroupper
570 ; AVX512VBMI2-NEXT: retq
572 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
573 ; AVX512VLBW: # %bb.0:
574 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
575 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
576 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
577 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
578 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
579 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
580 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
581 ; AVX512VLBW-NEXT: retq
583 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
584 ; AVX512VLVBMI2: # %bb.0:
585 ; AVX512VLVBMI2-NEXT: vpshldvw %xmm2, %xmm1, %xmm0
586 ; AVX512VLVBMI2-NEXT: retq
588 ; XOPAVX1-LABEL: var_funnnel_v8i16:
590 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
591 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
592 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
593 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
594 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
595 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
596 ; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
597 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
598 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
601 ; XOPAVX2-LABEL: var_funnnel_v8i16:
603 ; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
604 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
605 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
606 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
607 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
608 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
609 ; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1
610 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
611 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
614 ; X86-SSE2-LABEL: var_funnnel_v8i16:
616 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
617 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
618 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
619 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
620 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
621 ; X86-SSE2-NEXT: pslld $23, %xmm5
622 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
623 ; X86-SSE2-NEXT: paddd %xmm4, %xmm5
624 ; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5
625 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
626 ; X86-SSE2-NEXT: pmuludq %xmm5, %xmm3
627 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
628 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
629 ; X86-SSE2-NEXT: pmuludq %xmm6, %xmm5
630 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
631 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
632 ; X86-SSE2-NEXT: psrad $16, %xmm3
633 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
634 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
635 ; X86-SSE2-NEXT: pslld $23, %xmm2
636 ; X86-SSE2-NEXT: paddd %xmm4, %xmm2
637 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
638 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
639 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
640 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
641 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
642 ; X86-SSE2-NEXT: pmuludq %xmm4, %xmm1
643 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
644 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
645 ; X86-SSE2-NEXT: psrad $16, %xmm0
646 ; X86-SSE2-NEXT: packssdw %xmm3, %xmm0
647 ; X86-SSE2-NEXT: retl
; Sole instruction under test. %amt is a plain function argument, so every
; lane's shift amount is variable.
648 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
652 define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
653 ; SSE2-LABEL: var_funnnel_v16i8:
655 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
656 ; SSE2-NEXT: pxor %xmm5, %xmm5
657 ; SSE2-NEXT: movdqa %xmm2, %xmm4
658 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
659 ; SSE2-NEXT: movdqa %xmm4, %xmm6
660 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
661 ; SSE2-NEXT: pslld $23, %xmm6
662 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
663 ; SSE2-NEXT: paddd %xmm3, %xmm6
664 ; SSE2-NEXT: cvttps2dq %xmm6, %xmm6
665 ; SSE2-NEXT: pslld $16, %xmm6
666 ; SSE2-NEXT: psrad $16, %xmm6
667 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
668 ; SSE2-NEXT: pslld $23, %xmm4
669 ; SSE2-NEXT: paddd %xmm3, %xmm4
670 ; SSE2-NEXT: cvttps2dq %xmm4, %xmm7
671 ; SSE2-NEXT: pslld $16, %xmm7
672 ; SSE2-NEXT: psrad $16, %xmm7
673 ; SSE2-NEXT: packssdw %xmm6, %xmm7
674 ; SSE2-NEXT: movdqa %xmm1, %xmm4
675 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
676 ; SSE2-NEXT: pmullw %xmm7, %xmm4
677 ; SSE2-NEXT: psrlw $8, %xmm4
678 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
679 ; SSE2-NEXT: movdqa %xmm2, %xmm5
680 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
681 ; SSE2-NEXT: pslld $23, %xmm5
682 ; SSE2-NEXT: paddd %xmm3, %xmm5
683 ; SSE2-NEXT: cvttps2dq %xmm5, %xmm5
684 ; SSE2-NEXT: pslld $16, %xmm5
685 ; SSE2-NEXT: psrad $16, %xmm5
686 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
687 ; SSE2-NEXT: pslld $23, %xmm2
688 ; SSE2-NEXT: paddd %xmm3, %xmm2
689 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
690 ; SSE2-NEXT: pslld $16, %xmm2
691 ; SSE2-NEXT: psrad $16, %xmm2
692 ; SSE2-NEXT: packssdw %xmm5, %xmm2
693 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
694 ; SSE2-NEXT: pmullw %xmm1, %xmm2
695 ; SSE2-NEXT: psrlw $8, %xmm2
696 ; SSE2-NEXT: packuswb %xmm4, %xmm2
697 ; SSE2-NEXT: movdqa %xmm2, %xmm0
700 ; SSE41-LABEL: var_funnnel_v16i8:
702 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
703 ; SSE41-NEXT: pxor %xmm3, %xmm3
704 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
705 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
706 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
707 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
708 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
709 ; SSE41-NEXT: pslld $23, %xmm2
710 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
711 ; SSE41-NEXT: paddd %xmm6, %xmm2
712 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
713 ; SSE41-NEXT: pslld $23, %xmm3
714 ; SSE41-NEXT: paddd %xmm6, %xmm3
715 ; SSE41-NEXT: cvttps2dq %xmm3, %xmm3
716 ; SSE41-NEXT: packusdw %xmm2, %xmm3
717 ; SSE41-NEXT: movdqa %xmm1, %xmm7
718 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
719 ; SSE41-NEXT: pmullw %xmm3, %xmm7
720 ; SSE41-NEXT: psrlw $8, %xmm7
721 ; SSE41-NEXT: pslld $23, %xmm4
722 ; SSE41-NEXT: paddd %xmm6, %xmm4
723 ; SSE41-NEXT: cvttps2dq %xmm4, %xmm2
724 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
725 ; SSE41-NEXT: pslld $23, %xmm5
726 ; SSE41-NEXT: paddd %xmm6, %xmm5
727 ; SSE41-NEXT: cvttps2dq %xmm5, %xmm3
728 ; SSE41-NEXT: packusdw %xmm3, %xmm2
729 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
730 ; SSE41-NEXT: pmullw %xmm1, %xmm2
731 ; SSE41-NEXT: psrlw $8, %xmm2
732 ; SSE41-NEXT: packuswb %xmm7, %xmm2
733 ; SSE41-NEXT: movdqa %xmm2, %xmm0
736 ; AVX1-LABEL: var_funnnel_v16i8:
738 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
739 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
740 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
741 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
742 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
743 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
744 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
745 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
746 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
747 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
748 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
749 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
750 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
751 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
752 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
753 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
754 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
755 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
756 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
757 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
758 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
759 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
760 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
761 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
762 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
763 ; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
764 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
765 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
766 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
767 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
770 ; AVX2-LABEL: var_funnnel_v16i8:
772 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
773 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
774 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
775 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
776 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
777 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
778 ; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm3
779 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
780 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
781 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
782 ; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
783 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
784 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
785 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
786 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
787 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
788 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
789 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
790 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
791 ; AVX2-NEXT: vzeroupper
794 ; AVX512F-LABEL: var_funnnel_v16i8:
796 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
797 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
798 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
799 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
800 ; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
801 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
802 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
803 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
804 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
805 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
806 ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
807 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
808 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
809 ; AVX512F-NEXT: vzeroupper
812 ; AVX512VL-LABEL: var_funnnel_v16i8:
814 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
815 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
816 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
817 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
818 ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
819 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
820 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
821 ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
822 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
823 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
824 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
825 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
826 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
827 ; AVX512VL-NEXT: vzeroupper
828 ; AVX512VL-NEXT: retq
830 ; AVX512BW-LABEL: var_funnnel_v16i8:
832 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
833 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
834 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
835 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
836 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
837 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
838 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
839 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
840 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
841 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
842 ; AVX512BW-NEXT: vzeroupper
843 ; AVX512BW-NEXT: retq
845 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
846 ; AVX512VBMI2: # %bb.0:
847 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
848 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
849 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
850 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1
851 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
852 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
853 ; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm1, %zmm0
854 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
855 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
856 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
857 ; AVX512VBMI2-NEXT: vzeroupper
858 ; AVX512VBMI2-NEXT: retq
860 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
861 ; AVX512VLBW: # %bb.0:
862 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
863 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
864 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
865 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
866 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
867 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
868 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
869 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
870 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
871 ; AVX512VLBW-NEXT: vzeroupper
872 ; AVX512VLBW-NEXT: retq
874 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
875 ; AVX512VLVBMI2: # %bb.0:
876 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
877 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
878 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
879 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
880 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
881 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
882 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm0, %ymm3, %ymm0
883 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
884 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
885 ; AVX512VLVBMI2-NEXT: vzeroupper
886 ; AVX512VLVBMI2-NEXT: retq
888 ; XOPAVX1-LABEL: var_funnnel_v16i8:
890 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
891 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
892 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
893 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
894 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
895 ; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
896 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
897 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
898 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
899 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
902 ; XOPAVX2-LABEL: var_funnnel_v16i8:
904 ; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
905 ; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1
906 ; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
907 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
908 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
909 ; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
910 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
911 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
912 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
913 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
916 ; X86-SSE2-LABEL: var_funnnel_v16i8:
918 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
919 ; X86-SSE2-NEXT: pxor %xmm5, %xmm5
920 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
921 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
922 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm6
923 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
924 ; X86-SSE2-NEXT: pslld $23, %xmm6
925 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
926 ; X86-SSE2-NEXT: paddd %xmm3, %xmm6
927 ; X86-SSE2-NEXT: cvttps2dq %xmm6, %xmm6
928 ; X86-SSE2-NEXT: pslld $16, %xmm6
929 ; X86-SSE2-NEXT: psrad $16, %xmm6
930 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
931 ; X86-SSE2-NEXT: pslld $23, %xmm4
932 ; X86-SSE2-NEXT: paddd %xmm3, %xmm4
933 ; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm7
934 ; X86-SSE2-NEXT: pslld $16, %xmm7
935 ; X86-SSE2-NEXT: psrad $16, %xmm7
936 ; X86-SSE2-NEXT: packssdw %xmm6, %xmm7
937 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
938 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
939 ; X86-SSE2-NEXT: pmullw %xmm7, %xmm4
940 ; X86-SSE2-NEXT: psrlw $8, %xmm4
941 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
942 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
943 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
944 ; X86-SSE2-NEXT: pslld $23, %xmm5
945 ; X86-SSE2-NEXT: paddd %xmm3, %xmm5
946 ; X86-SSE2-NEXT: cvttps2dq %xmm5, %xmm5
947 ; X86-SSE2-NEXT: pslld $16, %xmm5
948 ; X86-SSE2-NEXT: psrad $16, %xmm5
949 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
950 ; X86-SSE2-NEXT: pslld $23, %xmm2
951 ; X86-SSE2-NEXT: paddd %xmm3, %xmm2
952 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
953 ; X86-SSE2-NEXT: pslld $16, %xmm2
954 ; X86-SSE2-NEXT: psrad $16, %xmm2
955 ; X86-SSE2-NEXT: packssdw %xmm5, %xmm2
956 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
957 ; X86-SSE2-NEXT: pmullw %xmm1, %xmm2
958 ; X86-SSE2-NEXT: psrlw $8, %xmm2
959 ; X86-SSE2-NEXT: packuswb %xmm4, %xmm2
960 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
961 ; X86-SSE2-NEXT: retl
962 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
967 ; Uniform Variable Shifts
; splatvar_funnnel_v2i64: funnel-shift-left of <2 x i64> %x/%y where the shift
; amount is uniform across lanes: element 0 of %amt is splatted and passed as the
; third operand of @llvm.fshl.v2i64.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
970 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
971 ; SSE-LABEL: splatvar_funnnel_v2i64:
973 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
974 ; SSE-NEXT: movdqa %xmm2, %xmm4
975 ; SSE-NEXT: pandn %xmm3, %xmm4
976 ; SSE-NEXT: psrlq $1, %xmm1
977 ; SSE-NEXT: psrlq %xmm4, %xmm1
978 ; SSE-NEXT: pand %xmm3, %xmm2
979 ; SSE-NEXT: psllq %xmm2, %xmm0
980 ; SSE-NEXT: por %xmm1, %xmm0
983 ; AVX1-LABEL: splatvar_funnnel_v2i64:
985 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
986 ; AVX1-NEXT: # xmm3 = mem[0,0]
987 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
988 ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
989 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
990 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
991 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
992 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
995 ; AVX2-LABEL: splatvar_funnnel_v2i64:
997 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
998 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
999 ; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
1000 ; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1001 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1002 ; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1003 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1006 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
1008 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1009 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
1010 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
1011 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1012 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
1013 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1014 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1015 ; AVX512F-NEXT: retq
1017 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1018 ; AVX512VL: # %bb.0:
1019 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1020 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
1021 ; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
1022 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1023 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
1024 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1025 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1026 ; AVX512VL-NEXT: retq
1028 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1029 ; AVX512BW: # %bb.0:
1030 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1031 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1032 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
1033 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1034 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
1035 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1036 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1037 ; AVX512BW-NEXT: retq
1039 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1040 ; AVX512VBMI2: # %bb.0:
1041 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1042 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1043 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1044 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
1045 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1046 ; AVX512VBMI2-NEXT: vzeroupper
1047 ; AVX512VBMI2-NEXT: retq
1049 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1050 ; AVX512VLBW: # %bb.0:
1051 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1052 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1053 ; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1
1054 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1055 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
1056 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1057 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1058 ; AVX512VLBW-NEXT: retq
1060 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1061 ; AVX512VLVBMI2: # %bb.0:
1062 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1063 ; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0
1064 ; AVX512VLVBMI2-NEXT: retq
1066 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
1068 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
1069 ; XOPAVX1-NEXT: # xmm3 = mem[0,0]
1070 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
1071 ; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
1072 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1073 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1074 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1075 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1076 ; XOPAVX1-NEXT: retq
1078 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
1080 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1081 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1082 ; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
1083 ; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1084 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1085 ; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1086 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1087 ; XOPAVX2-NEXT: retq
1089 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
1090 ; X86-SSE2: # %bb.0:
1091 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
1092 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1093 ; X86-SSE2-NEXT: pandn %xmm3, %xmm4
1094 ; X86-SSE2-NEXT: psrlq $1, %xmm1
1095 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
1096 ; X86-SSE2-NEXT: pand %xmm3, %xmm2
1097 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
1098 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1099 ; X86-SSE2-NEXT: retl
; Broadcast element 0 of %amt to both lanes (all-zero shuffle mask), then use the
; splat as the funnel-shift amount.
1100 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1101 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
; splatvar_funnnel_v4i32: funnel-shift-left of <4 x i32> %x/%y where the shift
; amount is uniform across lanes: element 0 of %amt is splatted and passed as the
; third operand of @llvm.fshl.v4i32.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1105 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1106 ; SSE-LABEL: splatvar_funnnel_v4i32:
1108 ; SSE-NEXT: movdqa %xmm1, %xmm3
1109 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1110 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1111 ; SSE-NEXT: psllq %xmm2, %xmm3
1112 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1113 ; SSE-NEXT: psllq %xmm2, %xmm1
1114 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
1115 ; SSE-NEXT: movaps %xmm1, %xmm0
1118 ; AVX-LABEL: splatvar_funnnel_v4i32:
1120 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1121 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1122 ; AVX-NEXT: vpsllq %xmm2, %xmm3, %xmm3
1123 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1124 ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1125 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1128 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
1130 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1131 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1132 ; AVX512F-NEXT: vpsllq %xmm2, %xmm3, %xmm3
1133 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1134 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1135 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1136 ; AVX512F-NEXT: retq
1138 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1139 ; AVX512VL: # %bb.0:
1140 ; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1141 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1142 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm3, %xmm3
1143 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1144 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1145 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1146 ; AVX512VL-NEXT: retq
1148 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1149 ; AVX512BW: # %bb.0:
1150 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1151 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1152 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm3, %xmm3
1153 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1154 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1155 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1156 ; AVX512BW-NEXT: retq
1158 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1159 ; AVX512VBMI2: # %bb.0:
1160 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1161 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1162 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1163 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
1164 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1165 ; AVX512VBMI2-NEXT: vzeroupper
1166 ; AVX512VBMI2-NEXT: retq
1168 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1169 ; AVX512VLBW: # %bb.0:
1170 ; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1171 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1172 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm3, %xmm3
1173 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1174 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1175 ; AVX512VLBW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1176 ; AVX512VLBW-NEXT: retq
1178 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1179 ; AVX512VLVBMI2: # %bb.0:
1180 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1181 ; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
1182 ; AVX512VLVBMI2-NEXT: retq
1184 ; XOP-LABEL: splatvar_funnnel_v4i32:
1186 ; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1187 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1188 ; XOP-NEXT: vpsllq %xmm2, %xmm3, %xmm3
1189 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1190 ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1191 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1194 ; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1195 ; X86-SSE2: # %bb.0:
1196 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1197 ; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1198 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1199 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3
1200 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1201 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1
1202 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
1203 ; X86-SSE2-NEXT: movaps %xmm1, %xmm0
1204 ; X86-SSE2-NEXT: retl
; Broadcast element 0 of %amt to all four lanes (all-zero shuffle mask), then use
; the splat as the funnel-shift amount.
1205 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1206 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
; splatvar_funnnel_v8i16: funnel-shift-left of <8 x i16> %x/%y where the shift
; amount is uniform across lanes: element 0 of %amt is splatted and passed as the
; third operand of @llvm.fshl.v8i16.
; The per-prefix CHECK lines below are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1210 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1211 ; SSE-LABEL: splatvar_funnnel_v8i16:
1213 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
1214 ; SSE-NEXT: movdqa %xmm2, %xmm4
1215 ; SSE-NEXT: pandn %xmm3, %xmm4
1216 ; SSE-NEXT: psrlw $1, %xmm1
1217 ; SSE-NEXT: psrlw %xmm4, %xmm1
1218 ; SSE-NEXT: pand %xmm3, %xmm2
1219 ; SSE-NEXT: psllw %xmm2, %xmm0
1220 ; SSE-NEXT: por %xmm1, %xmm0
1223 ; AVX-LABEL: splatvar_funnnel_v8i16:
1225 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1226 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4
1227 ; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
1228 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1229 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
1230 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1231 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1234 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
1236 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1237 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
1238 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
1239 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1240 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
1241 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1242 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1243 ; AVX512F-NEXT: retq
1245 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1246 ; AVX512VL: # %bb.0:
1247 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1248 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
1249 ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
1250 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1251 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
1252 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1253 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1254 ; AVX512VL-NEXT: retq
1256 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1257 ; AVX512BW: # %bb.0:
1258 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1259 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1260 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
1261 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1262 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
1263 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1264 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1265 ; AVX512BW-NEXT: retq
1267 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1268 ; AVX512VBMI2: # %bb.0:
1269 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1270 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1271 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1272 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
1273 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1274 ; AVX512VBMI2-NEXT: vzeroupper
1275 ; AVX512VBMI2-NEXT: retq
1277 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1278 ; AVX512VLBW: # %bb.0:
1279 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1280 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
1281 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
1282 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1283 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
1284 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1285 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1286 ; AVX512VLBW-NEXT: retq
1288 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1289 ; AVX512VLVBMI2: # %bb.0:
1290 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1291 ; AVX512VLVBMI2-NEXT: vpshldvw %xmm2, %xmm1, %xmm0
1292 ; AVX512VLVBMI2-NEXT: retq
1294 ; XOP-LABEL: splatvar_funnnel_v8i16:
1296 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1297 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
1298 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
1299 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1300 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
1301 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1302 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1305 ; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1306 ; X86-SSE2: # %bb.0:
1307 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
1308 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1309 ; X86-SSE2-NEXT: pandn %xmm3, %xmm4
1310 ; X86-SSE2-NEXT: psrlw $1, %xmm1
1311 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
1312 ; X86-SSE2-NEXT: pand %xmm3, %xmm2
1313 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0
1314 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1315 ; X86-SSE2-NEXT: retl
; Broadcast element 0 of %amt to all eight lanes (all-zero shuffle mask), then use
; the splat as the funnel-shift amount.
1316 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1317 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
; Test: llvm.fshl.v16i8 on %x and %y where the shift amount is uniform across
; lanes - element 0 of %amt is splatted into all 16 lanes before the call.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1321 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1322 ; SSE-LABEL: splatvar_funnnel_v16i8:
1324 ; SSE-NEXT: movdqa %xmm1, %xmm3
1325 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1326 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1327 ; SSE-NEXT: psllw %xmm2, %xmm3
1328 ; SSE-NEXT: psrlw $8, %xmm3
1329 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1330 ; SSE-NEXT: psllw %xmm2, %xmm1
1331 ; SSE-NEXT: psrlw $8, %xmm1
1332 ; SSE-NEXT: packuswb %xmm3, %xmm1
1333 ; SSE-NEXT: movdqa %xmm1, %xmm0
1336 ; AVX-LABEL: splatvar_funnnel_v16i8:
1338 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1339 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1340 ; AVX-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1341 ; AVX-NEXT: vpsrlw $8, %xmm3, %xmm3
1342 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1343 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1344 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
1345 ; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1348 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
1350 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1351 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1352 ; AVX512F-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1353 ; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3
1354 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1355 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1356 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0
1357 ; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1358 ; AVX512F-NEXT: retq
1360 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1361 ; AVX512VL: # %bb.0:
1362 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1363 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1364 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1365 ; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3
1366 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1367 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1368 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
1369 ; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1370 ; AVX512VL-NEXT: retq
1372 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1373 ; AVX512BW: # %bb.0:
1374 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1375 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1376 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1377 ; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
1378 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1379 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1380 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
1381 ; AVX512BW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1382 ; AVX512BW-NEXT: retq
1384 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1385 ; AVX512VBMI2: # %bb.0:
1386 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1387 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1388 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1389 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
1390 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1391 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1392 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
1393 ; AVX512VBMI2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1394 ; AVX512VBMI2-NEXT: retq
1396 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1397 ; AVX512VLBW: # %bb.0:
1398 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1399 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1400 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1401 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
1402 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1403 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1404 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
1405 ; AVX512VLBW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1406 ; AVX512VLBW-NEXT: retq
1408 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1409 ; AVX512VLVBMI2: # %bb.0:
1410 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1411 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1412 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1413 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
1414 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1415 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1416 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
1417 ; AVX512VLVBMI2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1418 ; AVX512VLVBMI2-NEXT: retq
1420 ; XOP-LABEL: splatvar_funnnel_v16i8:
1422 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1423 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1424 ; XOP-NEXT: vpsllw %xmm2, %xmm3, %xmm3
1425 ; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1426 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1427 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15],xmm3[1,3,5,7,9,11,13,15]
1430 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1431 ; X86-SSE2: # %bb.0:
1432 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1433 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1434 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1435 ; X86-SSE2-NEXT: psllw %xmm2, %xmm3
1436 ; X86-SSE2-NEXT: psrlw $8, %xmm3
1437 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1438 ; X86-SSE2-NEXT: psllw %xmm2, %xmm1
1439 ; X86-SSE2-NEXT: psrlw $8, %xmm1
1440 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
1441 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
1442 ; X86-SSE2-NEXT: retl
; The all-zero shuffle mask selects element 0 of %amt for every result lane,
; so all 16 lanes of the funnel shift use the same amount.
1443 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
1444 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
1448 ; CodeGenPrepare (CGP) should allow a splat shift amount defined in a different basic block to be seen by SelectionDAG (SDAG).
1449 ; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
; Test: a loop that loads a <4 x i32> from %p at element offset %index, calls
; llvm.fshl.v4i32 with that loaded vector as both data operands and a splat of
; %shift_amt as the amount, and stores the result back to the same address.
; The splat is defined ahead of the loop's phi, while its only use is the fshl
; call inside the loop body.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1451 define void @sink_splatvar(ptr %p, i32 %shift_amt) {
1452 ; SSE-LABEL: sink_splatvar:
1453 ; SSE: # %bb.0: # %entry
1454 ; SSE-NEXT: movd %esi, %xmm0
1455 ; SSE-NEXT: movq $-1024, %rax # imm = 0xFC00
1456 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1457 ; SSE-NEXT: .p2align 4, 0x90
1458 ; SSE-NEXT: .LBB8_1: # %loop
1459 ; SSE-NEXT: # =>This Inner Loop Header: Depth=1
1460 ; SSE-NEXT: movdqu 1024(%rdi,%rax), %xmm1
1461 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
1462 ; SSE-NEXT: psllq %xmm0, %xmm2
1463 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1464 ; SSE-NEXT: psllq %xmm0, %xmm1
1465 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
1466 ; SSE-NEXT: movups %xmm1, 1024(%rdi,%rax)
1467 ; SSE-NEXT: addq $16, %rax
1468 ; SSE-NEXT: jne .LBB8_1
1469 ; SSE-NEXT: # %bb.2: # %end
1472 ; AVX1-LABEL: sink_splatvar:
1473 ; AVX1: # %bb.0: # %entry
1474 ; AVX1-NEXT: vmovd %esi, %xmm0
1475 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
1476 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1477 ; AVX1-NEXT: .p2align 4, 0x90
1478 ; AVX1-NEXT: .LBB8_1: # %loop
1479 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
1480 ; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
1481 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
1482 ; AVX1-NEXT: vpsllq %xmm0, %xmm2, %xmm2
1483 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1484 ; AVX1-NEXT: vpsllq %xmm0, %xmm1, %xmm1
1485 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
1486 ; AVX1-NEXT: vmovups %xmm1, 1024(%rdi,%rax)
1487 ; AVX1-NEXT: addq $16, %rax
1488 ; AVX1-NEXT: jne .LBB8_1
1489 ; AVX1-NEXT: # %bb.2: # %end
1492 ; AVX2-LABEL: sink_splatvar:
1493 ; AVX2: # %bb.0: # %entry
1494 ; AVX2-NEXT: vmovd %esi, %xmm0
1495 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
1496 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
1497 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
1498 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1499 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
1500 ; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm1
1501 ; AVX2-NEXT: .p2align 4, 0x90
1502 ; AVX2-NEXT: .LBB8_1: # %loop
1503 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
1504 ; AVX2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
1505 ; AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm3
1506 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm2
1507 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
1508 ; AVX2-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
1509 ; AVX2-NEXT: addq $16, %rax
1510 ; AVX2-NEXT: jne .LBB8_1
1511 ; AVX2-NEXT: # %bb.2: # %end
1514 ; AVX512F-LABEL: sink_splatvar:
1515 ; AVX512F: # %bb.0: # %entry
1516 ; AVX512F-NEXT: vmovd %esi, %xmm0
1517 ; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
1518 ; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
1519 ; AVX512F-NEXT: .p2align 4, 0x90
1520 ; AVX512F-NEXT: .LBB8_1: # %loop
1521 ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
1522 ; AVX512F-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
1523 ; AVX512F-NEXT: vprolvd %zmm0, %zmm1, %zmm1
1524 ; AVX512F-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1525 ; AVX512F-NEXT: addq $16, %rax
1526 ; AVX512F-NEXT: jne .LBB8_1
1527 ; AVX512F-NEXT: # %bb.2: # %end
1528 ; AVX512F-NEXT: vzeroupper
1529 ; AVX512F-NEXT: retq
1531 ; AVX512VL-LABEL: sink_splatvar:
1532 ; AVX512VL: # %bb.0: # %entry
1533 ; AVX512VL-NEXT: vpbroadcastd %esi, %xmm0
1534 ; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00
1535 ; AVX512VL-NEXT: .p2align 4, 0x90
1536 ; AVX512VL-NEXT: .LBB8_1: # %loop
1537 ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
1538 ; AVX512VL-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
1539 ; AVX512VL-NEXT: vprolvd %xmm0, %xmm1, %xmm1
1540 ; AVX512VL-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1541 ; AVX512VL-NEXT: addq $16, %rax
1542 ; AVX512VL-NEXT: jne .LBB8_1
1543 ; AVX512VL-NEXT: # %bb.2: # %end
1544 ; AVX512VL-NEXT: retq
1546 ; AVX512BW-LABEL: sink_splatvar:
1547 ; AVX512BW: # %bb.0: # %entry
1548 ; AVX512BW-NEXT: vmovd %esi, %xmm0
1549 ; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
1550 ; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
1551 ; AVX512BW-NEXT: .p2align 4, 0x90
1552 ; AVX512BW-NEXT: .LBB8_1: # %loop
1553 ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
1554 ; AVX512BW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
1555 ; AVX512BW-NEXT: vprolvd %zmm0, %zmm1, %zmm1
1556 ; AVX512BW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1557 ; AVX512BW-NEXT: addq $16, %rax
1558 ; AVX512BW-NEXT: jne .LBB8_1
1559 ; AVX512BW-NEXT: # %bb.2: # %end
1560 ; AVX512BW-NEXT: vzeroupper
1561 ; AVX512BW-NEXT: retq
1563 ; AVX512VBMI2-LABEL: sink_splatvar:
1564 ; AVX512VBMI2: # %bb.0: # %entry
1565 ; AVX512VBMI2-NEXT: vmovd %esi, %xmm0
1566 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %xmm0
1567 ; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
1568 ; AVX512VBMI2-NEXT: .p2align 4, 0x90
1569 ; AVX512VBMI2-NEXT: .LBB8_1: # %loop
1570 ; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1
1571 ; AVX512VBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
1572 ; AVX512VBMI2-NEXT: vprolvd %zmm0, %zmm1, %zmm1
1573 ; AVX512VBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1574 ; AVX512VBMI2-NEXT: addq $16, %rax
1575 ; AVX512VBMI2-NEXT: jne .LBB8_1
1576 ; AVX512VBMI2-NEXT: # %bb.2: # %end
1577 ; AVX512VBMI2-NEXT: vzeroupper
1578 ; AVX512VBMI2-NEXT: retq
1580 ; AVX512VLBW-LABEL: sink_splatvar:
1581 ; AVX512VLBW: # %bb.0: # %entry
1582 ; AVX512VLBW-NEXT: vpbroadcastd %esi, %xmm0
1583 ; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00
1584 ; AVX512VLBW-NEXT: .p2align 4, 0x90
1585 ; AVX512VLBW-NEXT: .LBB8_1: # %loop
1586 ; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1
1587 ; AVX512VLBW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
1588 ; AVX512VLBW-NEXT: vprolvd %xmm0, %xmm1, %xmm1
1589 ; AVX512VLBW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1590 ; AVX512VLBW-NEXT: addq $16, %rax
1591 ; AVX512VLBW-NEXT: jne .LBB8_1
1592 ; AVX512VLBW-NEXT: # %bb.2: # %end
1593 ; AVX512VLBW-NEXT: retq
1595 ; AVX512VLVBMI2-LABEL: sink_splatvar:
1596 ; AVX512VLVBMI2: # %bb.0: # %entry
1597 ; AVX512VLVBMI2-NEXT: vpbroadcastd %esi, %xmm0
1598 ; AVX512VLVBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
1599 ; AVX512VLVBMI2-NEXT: .p2align 4, 0x90
1600 ; AVX512VLVBMI2-NEXT: .LBB8_1: # %loop
1601 ; AVX512VLVBMI2-NEXT: # =>This Inner Loop Header: Depth=1
1602 ; AVX512VLVBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
1603 ; AVX512VLVBMI2-NEXT: vprolvd %xmm0, %xmm1, %xmm1
1604 ; AVX512VLVBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1605 ; AVX512VLVBMI2-NEXT: addq $16, %rax
1606 ; AVX512VLVBMI2-NEXT: jne .LBB8_1
1607 ; AVX512VLVBMI2-NEXT: # %bb.2: # %end
1608 ; AVX512VLVBMI2-NEXT: retq
1610 ; XOPAVX1-LABEL: sink_splatvar:
1611 ; XOPAVX1: # %bb.0: # %entry
1612 ; XOPAVX1-NEXT: vmovd %esi, %xmm0
1613 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1614 ; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
1615 ; XOPAVX1-NEXT: .p2align 4, 0x90
1616 ; XOPAVX1-NEXT: .LBB8_1: # %loop
1617 ; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1
1618 ; XOPAVX1-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1
1619 ; XOPAVX1-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1620 ; XOPAVX1-NEXT: addq $16, %rax
1621 ; XOPAVX1-NEXT: jne .LBB8_1
1622 ; XOPAVX1-NEXT: # %bb.2: # %end
1623 ; XOPAVX1-NEXT: retq
1625 ; XOPAVX2-LABEL: sink_splatvar:
1626 ; XOPAVX2: # %bb.0: # %entry
1627 ; XOPAVX2-NEXT: vmovd %esi, %xmm0
1628 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0
1629 ; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
1630 ; XOPAVX2-NEXT: .p2align 4, 0x90
1631 ; XOPAVX2-NEXT: .LBB8_1: # %loop
1632 ; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1
1633 ; XOPAVX2-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1
1634 ; XOPAVX2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
1635 ; XOPAVX2-NEXT: addq $16, %rax
1636 ; XOPAVX2-NEXT: jne .LBB8_1
1637 ; XOPAVX2-NEXT: # %bb.2: # %end
1638 ; XOPAVX2-NEXT: retq
1640 ; X86-SSE2-LABEL: sink_splatvar:
1641 ; X86-SSE2: # %bb.0: # %entry
1642 ; X86-SSE2-NEXT: pushl %esi
1643 ; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
1644 ; X86-SSE2-NEXT: .cfi_offset %esi, -8
1645 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1646 ; X86-SSE2-NEXT: xorl %ecx, %ecx
1647 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1648 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1649 ; X86-SSE2-NEXT: xorl %edx, %edx
1650 ; X86-SSE2-NEXT: .p2align 4, 0x90
1651 ; X86-SSE2-NEXT: .LBB8_1: # %loop
1652 ; X86-SSE2-NEXT: # =>This Inner Loop Header: Depth=1
1653 ; X86-SSE2-NEXT: movdqu (%eax,%ecx,4), %xmm1
1654 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
1655 ; X86-SSE2-NEXT: psllq %xmm0, %xmm2
1656 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1657 ; X86-SSE2-NEXT: psllq %xmm0, %xmm1
1658 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
1659 ; X86-SSE2-NEXT: movups %xmm1, (%eax,%ecx,4)
1660 ; X86-SSE2-NEXT: addl $4, %ecx
1661 ; X86-SSE2-NEXT: adcl $0, %edx
1662 ; X86-SSE2-NEXT: movl %ecx, %esi
1663 ; X86-SSE2-NEXT: xorl $256, %esi # imm = 0x100
1664 ; X86-SSE2-NEXT: orl %edx, %esi
1665 ; X86-SSE2-NEXT: jne .LBB8_1
1666 ; X86-SSE2-NEXT: # %bb.2: # %end
1667 ; X86-SSE2-NEXT: popl %esi
1668 ; X86-SSE2-NEXT: .cfi_def_cfa_offset 4
1669 ; X86-SSE2-NEXT: retl
; Build the splat: insert %shift_amt into lane 0, then broadcast lane 0 to all
; four lanes with an all-zero shuffle mask.
1671 %ins = insertelement <4 x i32> undef, i32 %shift_amt, i32 0
1672 %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
; Loop body: %index starts at 0 (from %entry) and advances by 4 i32 elements
; per iteration; the loop exits to %end once %inc equals 256.
1676 %index = phi i64 [ 0, %entry ], [ %inc, %loop ]
1677 %addr = getelementptr inbounds i32, ptr %p, i64 %index
1678 %x = load <4 x i32>, ptr %addr, align 4
1679 %fsh = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat)
1680 store <4 x i32> %fsh, ptr %addr, align 4
1681 %inc = add i64 %index, 4
1682 %iv = icmp eq i64 %inc, 256
1683 br i1 %iv, label %end, label %loop
; Test: llvm.fshl.v2i64 on %x and %y with constant, non-uniform per-lane shift
; amounts <4, 14>. In the expansions checked below, %x is shifted left by 4/14
; and %y is shifted right by 60/50 (i.e. 64 minus the amount) before the two
; halves are OR'ed together.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1693 define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1694 ; SSE2-LABEL: constant_funnnel_v2i64:
1696 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1697 ; SSE2-NEXT: psrlq $60, %xmm2
1698 ; SSE2-NEXT: psrlq $50, %xmm1
1699 ; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1700 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1701 ; SSE2-NEXT: psllq $4, %xmm1
1702 ; SSE2-NEXT: psllq $14, %xmm0
1703 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1704 ; SSE2-NEXT: orpd %xmm2, %xmm0
1707 ; SSE41-LABEL: constant_funnnel_v2i64:
1709 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1710 ; SSE41-NEXT: psrlq $50, %xmm2
1711 ; SSE41-NEXT: psrlq $60, %xmm1
1712 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1713 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1714 ; SSE41-NEXT: psllq $14, %xmm1
1715 ; SSE41-NEXT: psllq $4, %xmm0
1716 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1717 ; SSE41-NEXT: por %xmm2, %xmm0
1720 ; AVX1-LABEL: constant_funnnel_v2i64:
1722 ; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm2
1723 ; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1
1724 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1725 ; AVX1-NEXT: vpsllq $14, %xmm0, %xmm2
1726 ; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0
1727 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1728 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1731 ; AVX2-LABEL: constant_funnnel_v2i64:
1733 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1734 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1735 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1738 ; AVX512F-LABEL: constant_funnnel_v2i64:
1740 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1741 ; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1742 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1743 ; AVX512F-NEXT: retq
1745 ; AVX512VL-LABEL: constant_funnnel_v2i64:
1746 ; AVX512VL: # %bb.0:
1747 ; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1748 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1749 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1750 ; AVX512VL-NEXT: retq
1752 ; AVX512BW-LABEL: constant_funnnel_v2i64:
1753 ; AVX512BW: # %bb.0:
1754 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1755 ; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1756 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1757 ; AVX512BW-NEXT: retq
1759 ; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1760 ; AVX512VBMI2: # %bb.0:
1761 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1762 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1763 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14]
1764 ; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0
1765 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1766 ; AVX512VBMI2-NEXT: vzeroupper
1767 ; AVX512VBMI2-NEXT: retq
1769 ; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1770 ; AVX512VLBW: # %bb.0:
1771 ; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1772 ; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1773 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1774 ; AVX512VLBW-NEXT: retq
1776 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1777 ; AVX512VLVBMI2: # %bb.0:
1778 ; AVX512VLVBMI2-NEXT: vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
1779 ; AVX512VLVBMI2-NEXT: retq
1781 ; XOPAVX1-LABEL: constant_funnnel_v2i64:
1783 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1784 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1785 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1786 ; XOPAVX1-NEXT: retq
1788 ; XOPAVX2-LABEL: constant_funnnel_v2i64:
1790 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1791 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1792 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1793 ; XOPAVX2-NEXT: retq
1795 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
1796 ; X86-SSE2: # %bb.0:
1797 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1798 ; X86-SSE2-NEXT: psrlq $60, %xmm2
1799 ; X86-SSE2-NEXT: psrlq $50, %xmm1
1800 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1801 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1802 ; X86-SSE2-NEXT: psllq $4, %xmm1
1803 ; X86-SSE2-NEXT: psllq $14, %xmm0
1804 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1805 ; X86-SSE2-NEXT: orpd %xmm2, %xmm0
1806 ; X86-SSE2-NEXT: retl
1807 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
; Test: llvm.fshl.v4i32 on %x and %y with constant, non-uniform per-lane shift
; amounts <4, 5, 6, 7>. In the immediate-shift expansions checked below, %y is
; shifted right by 28/27/26/25 (i.e. 32 minus the amount) and the left shift of
; %x is done with a multiply by a constant-pool vector.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1811 define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
1812 ; SSE2-LABEL: constant_funnnel_v4i32:
1814 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1815 ; SSE2-NEXT: psrld $25, %xmm2
1816 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1817 ; SSE2-NEXT: psrld $26, %xmm3
1818 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1819 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1820 ; SSE2-NEXT: psrld $27, %xmm2
1821 ; SSE2-NEXT: psrld $28, %xmm1
1822 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1823 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1824 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1825 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1826 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1827 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1828 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1829 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1830 ; SSE2-NEXT: por %xmm1, %xmm0
1833 ; SSE41-LABEL: constant_funnnel_v4i32:
1835 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1836 ; SSE41-NEXT: psrld $25, %xmm2
1837 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1838 ; SSE41-NEXT: psrld $27, %xmm3
1839 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1840 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1841 ; SSE41-NEXT: psrld $26, %xmm2
1842 ; SSE41-NEXT: psrld $28, %xmm1
1843 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1844 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1845 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1846 ; SSE41-NEXT: por %xmm2, %xmm0
1849 ; AVX1-LABEL: constant_funnnel_v4i32:
1851 ; AVX1-NEXT: vpsrld $25, %xmm1, %xmm2
1852 ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm3
1853 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1854 ; AVX1-NEXT: vpsrld $26, %xmm1, %xmm3
1855 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
1856 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1857 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1858 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1859 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1862 ; AVX2-LABEL: constant_funnnel_v4i32:
1864 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1865 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1866 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1869 ; AVX512F-LABEL: constant_funnnel_v4i32:
1871 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1872 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1873 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1874 ; AVX512F-NEXT: retq
1876 ; AVX512VL-LABEL: constant_funnnel_v4i32:
1877 ; AVX512VL: # %bb.0:
1878 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1879 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1880 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1881 ; AVX512VL-NEXT: retq
1883 ; AVX512BW-LABEL: constant_funnnel_v4i32:
1884 ; AVX512BW: # %bb.0:
1885 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1886 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1887 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1888 ; AVX512BW-NEXT: retq
1890 ; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1891 ; AVX512VBMI2: # %bb.0:
1892 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1893 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1894 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7]
1895 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
1896 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1897 ; AVX512VBMI2-NEXT: vzeroupper
1898 ; AVX512VBMI2-NEXT: retq
1900 ; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1901 ; AVX512VLBW: # %bb.0:
1902 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1903 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1904 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1905 ; AVX512VLBW-NEXT: retq
1907 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1908 ; AVX512VLVBMI2: # %bb.0:
1909 ; AVX512VLVBMI2-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
1910 ; AVX512VLVBMI2-NEXT: retq
1912 ; XOPAVX1-LABEL: constant_funnnel_v4i32:
1914 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1915 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1916 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1917 ; XOPAVX1-NEXT: retq
1919 ; XOPAVX2-LABEL: constant_funnnel_v4i32:
1921 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1922 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1923 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1924 ; XOPAVX2-NEXT: retq
1926 ; X86-SSE2-LABEL: constant_funnnel_v4i32:
1927 ; X86-SSE2: # %bb.0:
1928 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1929 ; X86-SSE2-NEXT: psrld $25, %xmm2
1930 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1931 ; X86-SSE2-NEXT: psrld $26, %xmm3
1932 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1933 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1934 ; X86-SSE2-NEXT: psrld $27, %xmm2
1935 ; X86-SSE2-NEXT: psrld $28, %xmm1
1936 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1937 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1938 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1939 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1940 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1941 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1942 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1943 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1944 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1945 ; X86-SSE2-NEXT: retl
1946 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
; Test: llvm.fshl.v8i16 on %x and %y with constant, non-uniform per-lane shift
; amounts <0, 1, 2, 3, 4, 5, 6, 7>. In the non-VBMI2 expansions checked below,
; %y is first shifted right by 1 and then by a second per-lane amount; the
; AVX512BW check shows that second amount as [15,14,...,8], i.e. 16 minus the
; fshl amount in total.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1950 define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
1951 ; SSE-LABEL: constant_funnnel_v8i16:
1953 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1954 ; SSE-NEXT: psrlw $1, %xmm1
1955 ; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1956 ; SSE-NEXT: por %xmm1, %xmm0
1959 ; AVX-LABEL: constant_funnnel_v8i16:
1961 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1962 ; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
1963 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1964 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1967 ; AVX512F-LABEL: constant_funnnel_v8i16:
1969 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1970 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1
1971 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1972 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1973 ; AVX512F-NEXT: retq
1975 ; AVX512VL-LABEL: constant_funnnel_v8i16:
1976 ; AVX512VL: # %bb.0:
1977 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1978 ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
1979 ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1980 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1981 ; AVX512VL-NEXT: retq
1983 ; AVX512BW-LABEL: constant_funnnel_v8i16:
1984 ; AVX512BW: # %bb.0:
1985 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1986 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1987 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1988 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
1989 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
1990 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
1991 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1992 ; AVX512BW-NEXT: vzeroupper
1993 ; AVX512BW-NEXT: retq
1995 ; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
1996 ; AVX512VBMI2: # %bb.0:
1997 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1998 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1999 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2000 ; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
2001 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2002 ; AVX512VBMI2-NEXT: vzeroupper
2003 ; AVX512VBMI2-NEXT: retq
2005 ; AVX512VLBW-LABEL: constant_funnnel_v8i16:
2006 ; AVX512VLBW: # %bb.0:
2007 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2008 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
2009 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2010 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2011 ; AVX512VLBW-NEXT: retq
2013 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
2014 ; AVX512VLVBMI2: # %bb.0:
2015 ; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
2016 ; AVX512VLVBMI2-NEXT: retq
2018 ; XOP-LABEL: constant_funnnel_v8i16:
2020 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2021 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1
2022 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2023 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2026 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
2027 ; X86-SSE2: # %bb.0:
2028 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2029 ; X86-SSE2-NEXT: psrlw $1, %xmm1
2030 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2031 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2032 ; X86-SSE2-NEXT: retl
2033 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; Funnel-shift-left of two <16 x i8> inputs by the non-uniform constant
; vector <0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1> (see the call at the end).
;
; What the autogenerated assertions below record, by run configuration
; - The SSE, AVX, AVX512F, AVX512VL and 32-bit SSE2 runs interleave the bytes
;   of xmm1 and xmm0 (punpckhbw / punpcklbw), multiply the resulting words by
;   constant-pool operands (pmullw), shift each word right by 8 and repack
;   the bytes with packuswb.
; - The runs with BW or VBMI2 build 16-bit lanes from both inputs (vpmovzxbw
;   plus shift/or, or a byte permute), shift them with vpsllvw, shift right
;   by 8 and truncate with vpmovwb. Where the shift vector is materialized
;   inline it reads [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1], i.e. the amount 8 in
;   lane 8 shows up as 0.
;   NOTE(review): presumably because fshl takes the amount modulo the element
;   width - confirm against the LangRef.
; - The XOP runs use per-byte vpshlb shifts on both inputs and OR the results.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
2037 define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2038 ; SSE-LABEL: constant_funnnel_v16i8:
2040 ; SSE-NEXT: movdqa %xmm1, %xmm2
2041 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2042 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2043 ; SSE-NEXT: psrlw $8, %xmm2
2044 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2045 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2046 ; SSE-NEXT: psrlw $8, %xmm1
2047 ; SSE-NEXT: packuswb %xmm2, %xmm1
2048 ; SSE-NEXT: movdqa %xmm1, %xmm0
2051 ; AVX-LABEL: constant_funnnel_v16i8:
2053 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2054 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2055 ; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
2056 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2057 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2058 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
2059 ; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2062 ; AVX512F-LABEL: constant_funnnel_v16i8:
2064 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2065 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2066 ; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2
2067 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2068 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2069 ; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0
2070 ; AVX512F-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2071 ; AVX512F-NEXT: retq
2073 ; AVX512VL-LABEL: constant_funnnel_v16i8:
2074 ; AVX512VL: # %bb.0:
2075 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2076 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2077 ; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
2078 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2079 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2080 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
2081 ; AVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2082 ; AVX512VL-NEXT: retq
2084 ; AVX512BW-LABEL: constant_funnnel_v16i8:
2085 ; AVX512BW: # %bb.0:
2086 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2087 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2088 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2089 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
2090 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2091 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
2092 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
2093 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2094 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2095 ; AVX512BW-NEXT: vzeroupper
2096 ; AVX512BW-NEXT: retq
2098 ; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2099 ; AVX512VBMI2: # %bb.0:
2100 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2101 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2102 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
2103 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1
2104 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2105 ; AVX512VBMI2-NEXT: vpsllvw %zmm0, %zmm1, %zmm0
2106 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
2107 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2108 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2109 ; AVX512VBMI2-NEXT: vzeroupper
2110 ; AVX512VBMI2-NEXT: retq
2112 ; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2113 ; AVX512VLBW: # %bb.0:
2114 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2115 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2116 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
2117 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2118 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2119 ; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
2120 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2121 ; AVX512VLBW-NEXT: vzeroupper
2122 ; AVX512VLBW-NEXT: retq
2124 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2125 ; AVX512VLVBMI2: # %bb.0:
2126 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2127 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2128 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
2129 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
2130 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2131 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0
2132 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2133 ; AVX512VLVBMI2-NEXT: vzeroupper
2134 ; AVX512VLVBMI2-NEXT: retq
2136 ; XOP-LABEL: constant_funnnel_v16i8:
2138 ; XOP-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
2139 ; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
2140 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2141 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2142 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2145 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
2146 ; X86-SSE2: # %bb.0:
2147 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
2148 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2149 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
2150 ; X86-SSE2-NEXT: psrlw $8, %xmm2
2151 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2152 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2153 ; X86-SSE2-NEXT: psrlw $8, %xmm1
2154 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm1
2155 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
2156 ; X86-SSE2-NEXT: retl
2157 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2162 ; Uniform Constant Shifts
; Funnel-shift-left of two <2 x i64> inputs by the splat constant 14.
;
; The autogenerated assertions below record two lowerings
; - Runs with VBMI2 emit a single vpshldq $14 (on zmm registers, followed by
;   vzeroupper, when VL is not enabled; directly on xmm registers with VL).
; - Every other run, including the 32-bit SSE2 one, emits a right shift of
;   xmm1 by 50, a left shift of xmm0 by 14, and an OR of the two.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
2165 define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2166 ; SSE-LABEL: splatconstant_funnnel_v2i64:
2168 ; SSE-NEXT: psrlq $50, %xmm1
2169 ; SSE-NEXT: psllq $14, %xmm0
2170 ; SSE-NEXT: por %xmm1, %xmm0
2173 ; AVX-LABEL: splatconstant_funnnel_v2i64:
2175 ; AVX-NEXT: vpsrlq $50, %xmm1, %xmm1
2176 ; AVX-NEXT: vpsllq $14, %xmm0, %xmm0
2177 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2180 ; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2182 ; AVX512F-NEXT: vpsrlq $50, %xmm1, %xmm1
2183 ; AVX512F-NEXT: vpsllq $14, %xmm0, %xmm0
2184 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2185 ; AVX512F-NEXT: retq
2187 ; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2188 ; AVX512VL: # %bb.0:
2189 ; AVX512VL-NEXT: vpsrlq $50, %xmm1, %xmm1
2190 ; AVX512VL-NEXT: vpsllq $14, %xmm0, %xmm0
2191 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2192 ; AVX512VL-NEXT: retq
2194 ; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2195 ; AVX512BW: # %bb.0:
2196 ; AVX512BW-NEXT: vpsrlq $50, %xmm1, %xmm1
2197 ; AVX512BW-NEXT: vpsllq $14, %xmm0, %xmm0
2198 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2199 ; AVX512BW-NEXT: retq
2201 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2202 ; AVX512VBMI2: # %bb.0:
2203 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2204 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2205 ; AVX512VBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0
2206 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2207 ; AVX512VBMI2-NEXT: vzeroupper
2208 ; AVX512VBMI2-NEXT: retq
2210 ; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2211 ; AVX512VLBW: # %bb.0:
2212 ; AVX512VLBW-NEXT: vpsrlq $50, %xmm1, %xmm1
2213 ; AVX512VLBW-NEXT: vpsllq $14, %xmm0, %xmm0
2214 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2215 ; AVX512VLBW-NEXT: retq
2217 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2218 ; AVX512VLVBMI2: # %bb.0:
2219 ; AVX512VLVBMI2-NEXT: vpshldq $14, %xmm1, %xmm0, %xmm0
2220 ; AVX512VLVBMI2-NEXT: retq
2222 ; XOP-LABEL: splatconstant_funnnel_v2i64:
2224 ; XOP-NEXT: vpsrlq $50, %xmm1, %xmm1
2225 ; XOP-NEXT: vpsllq $14, %xmm0, %xmm0
2226 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2229 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
2230 ; X86-SSE2: # %bb.0:
2231 ; X86-SSE2-NEXT: psrlq $50, %xmm1
2232 ; X86-SSE2-NEXT: psllq $14, %xmm0
2233 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2234 ; X86-SSE2-NEXT: retl
2235 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
; Funnel-shift-left of two <4 x i32> inputs by the splat constant 4.
;
; The autogenerated assertions below record two lowerings
; - Runs with VBMI2 emit a single vpshldd $4 (on zmm registers, followed by
;   vzeroupper, when VL is not enabled; directly on xmm registers with VL).
; - Every other run, including the 32-bit SSE2 one, emits a right shift of
;   xmm1 by 28, a left shift of xmm0 by 4, and an OR of the two.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
2239 define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2240 ; SSE-LABEL: splatconstant_funnnel_v4i32:
2242 ; SSE-NEXT: psrld $28, %xmm1
2243 ; SSE-NEXT: pslld $4, %xmm0
2244 ; SSE-NEXT: por %xmm1, %xmm0
2247 ; AVX-LABEL: splatconstant_funnnel_v4i32:
2249 ; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
2250 ; AVX-NEXT: vpslld $4, %xmm0, %xmm0
2251 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2254 ; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2256 ; AVX512F-NEXT: vpsrld $28, %xmm1, %xmm1
2257 ; AVX512F-NEXT: vpslld $4, %xmm0, %xmm0
2258 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2259 ; AVX512F-NEXT: retq
2261 ; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2262 ; AVX512VL: # %bb.0:
2263 ; AVX512VL-NEXT: vpsrld $28, %xmm1, %xmm1
2264 ; AVX512VL-NEXT: vpslld $4, %xmm0, %xmm0
2265 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2266 ; AVX512VL-NEXT: retq
2268 ; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2269 ; AVX512BW: # %bb.0:
2270 ; AVX512BW-NEXT: vpsrld $28, %xmm1, %xmm1
2271 ; AVX512BW-NEXT: vpslld $4, %xmm0, %xmm0
2272 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2273 ; AVX512BW-NEXT: retq
2275 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2276 ; AVX512VBMI2: # %bb.0:
2277 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2278 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2279 ; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
2280 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2281 ; AVX512VBMI2-NEXT: vzeroupper
2282 ; AVX512VBMI2-NEXT: retq
2284 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2285 ; AVX512VLBW: # %bb.0:
2286 ; AVX512VLBW-NEXT: vpsrld $28, %xmm1, %xmm1
2287 ; AVX512VLBW-NEXT: vpslld $4, %xmm0, %xmm0
2288 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2289 ; AVX512VLBW-NEXT: retq
2291 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2292 ; AVX512VLVBMI2: # %bb.0:
2293 ; AVX512VLVBMI2-NEXT: vpshldd $4, %xmm1, %xmm0, %xmm0
2294 ; AVX512VLVBMI2-NEXT: retq
2296 ; XOP-LABEL: splatconstant_funnnel_v4i32:
2298 ; XOP-NEXT: vpsrld $28, %xmm1, %xmm1
2299 ; XOP-NEXT: vpslld $4, %xmm0, %xmm0
2300 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2303 ; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
2304 ; X86-SSE2: # %bb.0:
2305 ; X86-SSE2-NEXT: psrld $28, %xmm1
2306 ; X86-SSE2-NEXT: pslld $4, %xmm0
2307 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2308 ; X86-SSE2-NEXT: retl
2309 %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
; Funnel-shift-left of two <8 x i16> inputs by the splat constant 7.
;
; The autogenerated assertions below record two lowerings
; - Runs with VBMI2 emit a single vpshldw $7 (on zmm registers, followed by
;   vzeroupper, when VL is not enabled; directly on xmm registers with VL).
; - Every other run, including the 32-bit SSE2 one, emits a right shift of
;   xmm1 by 9, a left shift of xmm0 by 7, and an OR of the two.
;
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
2313 define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2314 ; SSE-LABEL: splatconstant_funnnel_v8i16:
2316 ; SSE-NEXT: psrlw $9, %xmm1
2317 ; SSE-NEXT: psllw $7, %xmm0
2318 ; SSE-NEXT: por %xmm1, %xmm0
2321 ; AVX-LABEL: splatconstant_funnnel_v8i16:
2323 ; AVX-NEXT: vpsrlw $9, %xmm1, %xmm1
2324 ; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
2325 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2328 ; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2330 ; AVX512F-NEXT: vpsrlw $9, %xmm1, %xmm1
2331 ; AVX512F-NEXT: vpsllw $7, %xmm0, %xmm0
2332 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2333 ; AVX512F-NEXT: retq
2335 ; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2336 ; AVX512VL: # %bb.0:
2337 ; AVX512VL-NEXT: vpsrlw $9, %xmm1, %xmm1
2338 ; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
2339 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2340 ; AVX512VL-NEXT: retq
2342 ; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2343 ; AVX512BW: # %bb.0:
2344 ; AVX512BW-NEXT: vpsrlw $9, %xmm1, %xmm1
2345 ; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
2346 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2347 ; AVX512BW-NEXT: retq
2349 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2350 ; AVX512VBMI2: # %bb.0:
2351 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2352 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2353 ; AVX512VBMI2-NEXT: vpshldw $7, %zmm1, %zmm0, %zmm0
2354 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2355 ; AVX512VBMI2-NEXT: vzeroupper
2356 ; AVX512VBMI2-NEXT: retq
2358 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2359 ; AVX512VLBW: # %bb.0:
2360 ; AVX512VLBW-NEXT: vpsrlw $9, %xmm1, %xmm1
2361 ; AVX512VLBW-NEXT: vpsllw $7, %xmm0, %xmm0
2362 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2363 ; AVX512VLBW-NEXT: retq
2365 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2366 ; AVX512VLVBMI2: # %bb.0:
2367 ; AVX512VLVBMI2-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm0
2368 ; AVX512VLVBMI2-NEXT: retq
2370 ; XOP-LABEL: splatconstant_funnnel_v8i16:
2372 ; XOP-NEXT: vpsrlw $9, %xmm1, %xmm1
2373 ; XOP-NEXT: vpsllw $7, %xmm0, %xmm0
2374 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2377 ; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
2378 ; X86-SSE2: # %bb.0:
2379 ; X86-SSE2-NEXT: psrlw $9, %xmm1
2380 ; X86-SSE2-NEXT: psllw $7, %xmm0
2381 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2382 ; X86-SSE2-NEXT: retl
2383 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
2387 define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2388 ; SSE-LABEL: splatconstant_funnnel_v16i8:
2390 ; SSE-NEXT: psrlw $4, %xmm1
2391 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2392 ; SSE-NEXT: psllw $4, %xmm0
2393 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2394 ; SSE-NEXT: por %xmm1, %xmm0
2397 ; AVX-LABEL: splatconstant_funnnel_v16i8:
2399 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
2400 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2401 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2402 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2403 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2406 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
2408 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2
2409 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0
2410 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
2411 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2412 ; AVX512F-NEXT: vzeroupper
2413 ; AVX512F-NEXT: retq
2415 ; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
2416 ; AVX512VL: # %bb.0:
2417 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
2418 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
2419 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
2420 ; AVX512VL-NEXT: retq
2422 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
2423 ; AVX512BW: # %bb.0:
2424 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2
2425 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0
2426 ; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
2427 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2428 ; AVX512BW-NEXT: vzeroupper
2429 ; AVX512BW-NEXT: retq
2431 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
2432 ; AVX512VBMI2: # %bb.0:
2433 ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2434 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2435 ; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
2436 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2437 ; AVX512VBMI2-NEXT: vzeroupper
2438 ; AVX512VBMI2-NEXT: retq
2440 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
2441 ; AVX512VLBW: # %bb.0:
2442 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
2443 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
2444 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
2445 ; AVX512VLBW-NEXT: retq
2447 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
2448 ; AVX512VLVBMI2: # %bb.0:
2449 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2450 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2451 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
2452 ; AVX512VLVBMI2-NEXT: retq
2454 ; XOP-LABEL: splatconstant_funnnel_v16i8:
2456 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2457 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2458 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2461 ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
2462 ; X86-SSE2: # %bb.0:
2463 ; X86-SSE2-NEXT: psrlw $4, %xmm1
2464 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2465 ; X86-SSE2-NEXT: psllw $4, %xmm0
2466 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2467 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2468 ; X86-SSE2-NEXT: retl
2469 %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)