1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
15 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
; Declarations of the llvm.fshr (funnel shift right) intrinsic, one per
; 128-bit vector type (<2 x i64>, <4 x i32>, <8 x i16>, <16 x i8>). Each takes
; three operands of the same vector type.
18 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
20 declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
21 declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
; Funnel shift right of <2 x i64> with a variable shift amount: the body is a
; single call to @llvm.fshr.v2i64(%x, %y, %amt), where %amt is a full vector
; argument (one amount per element).
;
; What the expected output below shows:
;  - Most runs expand the operation as
;      (%y >> (%amt & 63)) | ((%x << 1) << (~%amt & 63))
;    using the [63,63] mask with pand/pandn, a fixed shift-left-by-1 of %x, and
;    a final OR. Runs without per-element 64-bit variable shifts do each lane
;    separately and blend the halves; the AVX2-and-later runs use
;    vpsrlvq/vpsllvq.
;  - The XOP+AVX1 run uses vpshlq for both directions, negating the masked
;    amount (0 - amt) for the right shift.
;  - The two VBMI2 runs expect a single vpshrdvq (widened to zmm registers when
;    AVX512VL is not enabled).
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
27 define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
28 ; SSE2-LABEL: var_funnnel_v2i64:
30 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
31 ; SSE2-NEXT: movdqa %xmm2, %xmm4
32 ; SSE2-NEXT: pand %xmm3, %xmm4
33 ; SSE2-NEXT: movdqa %xmm1, %xmm5
34 ; SSE2-NEXT: psrlq %xmm4, %xmm5
35 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
36 ; SSE2-NEXT: psrlq %xmm4, %xmm1
37 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
38 ; SSE2-NEXT: pandn %xmm3, %xmm2
39 ; SSE2-NEXT: psllq $1, %xmm0
40 ; SSE2-NEXT: movdqa %xmm0, %xmm3
41 ; SSE2-NEXT: psllq %xmm2, %xmm3
42 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
43 ; SSE2-NEXT: psllq %xmm2, %xmm0
44 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
45 ; SSE2-NEXT: orpd %xmm1, %xmm0
48 ; SSE41-LABEL: var_funnnel_v2i64:
50 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
51 ; SSE41-NEXT: movdqa %xmm2, %xmm4
52 ; SSE41-NEXT: pand %xmm3, %xmm4
53 ; SSE41-NEXT: movdqa %xmm1, %xmm5
54 ; SSE41-NEXT: psrlq %xmm4, %xmm5
55 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
56 ; SSE41-NEXT: psrlq %xmm4, %xmm1
57 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
58 ; SSE41-NEXT: pandn %xmm3, %xmm2
59 ; SSE41-NEXT: psllq $1, %xmm0
60 ; SSE41-NEXT: movdqa %xmm0, %xmm3
61 ; SSE41-NEXT: psllq %xmm2, %xmm3
62 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
63 ; SSE41-NEXT: psllq %xmm2, %xmm0
64 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
65 ; SSE41-NEXT: por %xmm1, %xmm0
68 ; AVX1-LABEL: var_funnnel_v2i64:
70 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
71 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
72 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
73 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
74 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
75 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
76 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
77 ; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
78 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
79 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
80 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
81 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
82 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
85 ; AVX2-LABEL: var_funnnel_v2i64:
87 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
88 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
89 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
90 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
91 ; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0
92 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
93 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
96 ; AVX512F-LABEL: var_funnnel_v2i64:
98 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
99 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
100 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
101 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
102 ; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0
103 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
104 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
107 ; AVX512VL-LABEL: var_funnnel_v2i64:
109 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
110 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
111 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
112 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
113 ; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0
114 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
115 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
116 ; AVX512VL-NEXT: retq
118 ; AVX512BW-LABEL: var_funnnel_v2i64:
120 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
121 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
122 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
123 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
124 ; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0
125 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
126 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
127 ; AVX512BW-NEXT: retq
129 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
130 ; AVX512VBMI2: # %bb.0:
131 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
132 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
133 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
134 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
135 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
136 ; AVX512VBMI2-NEXT: vzeroupper
137 ; AVX512VBMI2-NEXT: retq
139 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
140 ; AVX512VLBW: # %bb.0:
141 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
142 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
143 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
144 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
145 ; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0
146 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
147 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
148 ; AVX512VLBW-NEXT: retq
150 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
151 ; AVX512VLVBMI2: # %bb.0:
152 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
153 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
154 ; AVX512VLVBMI2-NEXT: retq
156 ; XOPAVX1-LABEL: var_funnnel_v2i64:
158 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
159 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
160 ; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
161 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
162 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
163 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
164 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
165 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
166 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
169 ; XOPAVX2-LABEL: var_funnnel_v2i64:
171 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
172 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
173 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
174 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
175 ; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0
176 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
177 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
180 ; X86-SSE2-LABEL: var_funnnel_v2i64:
182 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
183 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
184 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
185 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
186 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
187 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
188 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
189 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
190 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
191 ; X86-SSE2-NEXT: psllq $1, %xmm0
192 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
193 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3
194 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
195 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
196 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
197 ; X86-SSE2-NEXT: orpd %xmm1, %xmm0
198 ; X86-SSE2-NEXT: retl
199 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; Funnel shift right of <4 x i32> with a variable shift amount: the body is a
; single call to @llvm.fshr.v4i32(%x, %y, %amt), where %amt is a full vector
; argument (one amount per element).
;
; What the expected output below shows:
;  - The shift amount is masked with [31,31,31,31]; the right-shift half uses
;    (%amt & 31) and the left-shift half uses (~%amt & 31) applied to %x after
;    a fixed shift-left-by-1.
;  - The SSE2, SSE4.1 and AVX1 runs have no per-element 32-bit variable shift.
;    They perform the right shift one lane at a time (four psrld plus blends)
;    and realise the left shift as a multiply (pmuludq/pmulld) by a value
;    built with pslld $23 + paddd of a constant-pool value + cvttps2dq.
;    NOTE(review): this looks like the usual "2^n via float exponent" trick;
;    the constant itself is not visible in these checks.
;  - The AVX2-and-later runs use vpsrlvd/vpsllvd; XOP+AVX1 uses vpshld with a
;    negated amount for the right shift.
;  - The two VBMI2 runs expect a single vpshrdvd.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
203 define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
204 ; SSE2-LABEL: var_funnnel_v4i32:
206 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
207 ; SSE2-NEXT: movdqa %xmm2, %xmm5
208 ; SSE2-NEXT: pand %xmm4, %xmm5
209 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
210 ; SSE2-NEXT: movdqa %xmm1, %xmm6
211 ; SSE2-NEXT: psrld %xmm3, %xmm6
212 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
213 ; SSE2-NEXT: movdqa %xmm1, %xmm3
214 ; SSE2-NEXT: psrld %xmm7, %xmm3
215 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
216 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
217 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
218 ; SSE2-NEXT: movdqa %xmm1, %xmm7
219 ; SSE2-NEXT: psrld %xmm6, %xmm7
220 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
221 ; SSE2-NEXT: psrld %xmm5, %xmm1
222 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
223 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
224 ; SSE2-NEXT: pandn %xmm4, %xmm2
225 ; SSE2-NEXT: pslld $23, %xmm2
226 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
227 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
228 ; SSE2-NEXT: pslld $1, %xmm0
229 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
230 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
231 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
232 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
233 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
234 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
235 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
236 ; SSE2-NEXT: por %xmm3, %xmm0
239 ; SSE41-LABEL: var_funnnel_v4i32:
241 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [31,31,31,31]
242 ; SSE41-NEXT: movdqa %xmm2, %xmm4
243 ; SSE41-NEXT: pand %xmm8, %xmm4
244 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
245 ; SSE41-NEXT: movdqa %xmm1, %xmm6
246 ; SSE41-NEXT: psrld %xmm5, %xmm6
247 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
248 ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
249 ; SSE41-NEXT: movdqa %xmm1, %xmm3
250 ; SSE41-NEXT: psrld %xmm7, %xmm3
251 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4,5,6,7]
252 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
253 ; SSE41-NEXT: movdqa %xmm1, %xmm6
254 ; SSE41-NEXT: psrld %xmm4, %xmm6
255 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
256 ; SSE41-NEXT: psrld %xmm4, %xmm1
257 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
258 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
259 ; SSE41-NEXT: pandn %xmm8, %xmm2
260 ; SSE41-NEXT: pslld $23, %xmm2
261 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
262 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
263 ; SSE41-NEXT: pslld $1, %xmm0
264 ; SSE41-NEXT: pmulld %xmm2, %xmm0
265 ; SSE41-NEXT: por %xmm1, %xmm0
268 ; AVX1-LABEL: var_funnnel_v4i32:
270 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
271 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
272 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
273 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
274 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
275 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
276 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
277 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
278 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
279 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
280 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
281 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
282 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
283 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
284 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
285 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
286 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
287 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
288 ; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
289 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
290 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
293 ; AVX2-LABEL: var_funnnel_v4i32:
295 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
296 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
297 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
298 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
299 ; AVX2-NEXT: vpslld $1, %xmm0, %xmm0
300 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
301 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
304 ; AVX512F-LABEL: var_funnnel_v4i32:
306 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
307 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
308 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
309 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
310 ; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0
311 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
312 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
315 ; AVX512VL-LABEL: var_funnnel_v4i32:
317 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
318 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
319 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
320 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
321 ; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0
322 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
323 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
324 ; AVX512VL-NEXT: retq
326 ; AVX512BW-LABEL: var_funnnel_v4i32:
328 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
329 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
330 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
331 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
332 ; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0
333 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
334 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
335 ; AVX512BW-NEXT: retq
337 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
338 ; AVX512VBMI2: # %bb.0:
339 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
340 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
341 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
342 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
343 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
344 ; AVX512VBMI2-NEXT: vzeroupper
345 ; AVX512VBMI2-NEXT: retq
347 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
348 ; AVX512VLBW: # %bb.0:
349 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
350 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
351 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
352 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
353 ; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0
354 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
355 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
356 ; AVX512VLBW-NEXT: retq
358 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
359 ; AVX512VLVBMI2: # %bb.0:
360 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
361 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
362 ; AVX512VLVBMI2-NEXT: retq
364 ; XOPAVX1-LABEL: var_funnnel_v4i32:
366 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
367 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
368 ; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
369 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
370 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
371 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
372 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
373 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
374 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
377 ; XOPAVX2-LABEL: var_funnnel_v4i32:
379 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
380 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
381 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
382 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
383 ; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0
384 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
385 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
388 ; X86-SSE2-LABEL: var_funnnel_v4i32:
390 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
391 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
392 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
393 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
394 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
395 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6
396 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
397 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
398 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3
399 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
400 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
401 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
402 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
403 ; X86-SSE2-NEXT: psrld %xmm6, %xmm7
404 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
405 ; X86-SSE2-NEXT: psrld %xmm5, %xmm1
406 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
407 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
408 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
409 ; X86-SSE2-NEXT: pslld $23, %xmm2
410 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
411 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
412 ; X86-SSE2-NEXT: pslld $1, %xmm0
413 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
414 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
415 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
416 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
417 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
418 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
419 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
420 ; X86-SSE2-NEXT: por %xmm3, %xmm0
421 ; X86-SSE2-NEXT: retl
422 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; Funnel shift right of <8 x i16> with a variable shift amount: the body is a
; single call to @llvm.fshr.v8i16(%x, %y, %amt), where %amt is a full vector
; argument (one amount per element).
;
; What the expected output below shows:
;  - The SSE and AVX1 runs build the right shift of %y from conditional
;    shifts by 8, 4, 2 and 1 (psrlw + select via psraw/pand/pandn or
;    pblendvb), and the left shift of (%x << 1) as a pmullw by a per-element
;    multiplier derived from the inverted amount.
;  - The AVX2, AVX512F and AVX512VL runs zero-extend the 16-bit elements to
;    32 bits (vpmovzxwd), use vpsrlvd/vpsllvd, and narrow the result again.
;  - The BW runs use the 16-bit variable shifts vpsrlvw/vpsllvw (on zmm
;    registers when AVX512VL is not enabled).
;  - The two VBMI2 runs expect a single vpshrdvw.
;  - Both XOP run lines share one set of checks under the common XOP prefix
;    (vpshlw, with a negated amount for the right shift).
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script instead of editing them by hand.
426 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
427 ; SSE2-LABEL: var_funnnel_v8i16:
429 ; SSE2-NEXT: movdqa %xmm2, %xmm3
430 ; SSE2-NEXT: psllw $12, %xmm3
431 ; SSE2-NEXT: movdqa %xmm3, %xmm4
432 ; SSE2-NEXT: psraw $15, %xmm4
433 ; SSE2-NEXT: movdqa %xmm4, %xmm5
434 ; SSE2-NEXT: pandn %xmm1, %xmm5
435 ; SSE2-NEXT: psrlw $8, %xmm1
436 ; SSE2-NEXT: pand %xmm4, %xmm1
437 ; SSE2-NEXT: por %xmm5, %xmm1
438 ; SSE2-NEXT: paddw %xmm3, %xmm3
439 ; SSE2-NEXT: movdqa %xmm3, %xmm4
440 ; SSE2-NEXT: psraw $15, %xmm4
441 ; SSE2-NEXT: movdqa %xmm4, %xmm5
442 ; SSE2-NEXT: pandn %xmm1, %xmm5
443 ; SSE2-NEXT: psrlw $4, %xmm1
444 ; SSE2-NEXT: pand %xmm4, %xmm1
445 ; SSE2-NEXT: por %xmm5, %xmm1
446 ; SSE2-NEXT: paddw %xmm3, %xmm3
447 ; SSE2-NEXT: movdqa %xmm3, %xmm4
448 ; SSE2-NEXT: psraw $15, %xmm4
449 ; SSE2-NEXT: movdqa %xmm4, %xmm5
450 ; SSE2-NEXT: pandn %xmm1, %xmm5
451 ; SSE2-NEXT: psrlw $2, %xmm1
452 ; SSE2-NEXT: pand %xmm4, %xmm1
453 ; SSE2-NEXT: por %xmm5, %xmm1
454 ; SSE2-NEXT: paddw %xmm3, %xmm3
455 ; SSE2-NEXT: psraw $15, %xmm3
456 ; SSE2-NEXT: movdqa %xmm3, %xmm4
457 ; SSE2-NEXT: pandn %xmm1, %xmm4
458 ; SSE2-NEXT: psrlw $1, %xmm1
459 ; SSE2-NEXT: pand %xmm3, %xmm1
460 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
461 ; SSE2-NEXT: movdqa %xmm2, %xmm3
462 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
463 ; SSE2-NEXT: pslld $23, %xmm3
464 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
465 ; SSE2-NEXT: paddd %xmm5, %xmm3
466 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
467 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
468 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
469 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
470 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
471 ; SSE2-NEXT: pslld $23, %xmm2
472 ; SSE2-NEXT: paddd %xmm5, %xmm2
473 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
474 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
475 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
476 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
477 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
478 ; SSE2-NEXT: psllw $1, %xmm0
479 ; SSE2-NEXT: pmullw %xmm2, %xmm0
480 ; SSE2-NEXT: por %xmm4, %xmm0
481 ; SSE2-NEXT: por %xmm1, %xmm0
484 ; SSE41-LABEL: var_funnnel_v8i16:
486 ; SSE41-NEXT: movdqa %xmm0, %xmm3
487 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
488 ; SSE41-NEXT: movdqa %xmm2, %xmm0
489 ; SSE41-NEXT: pand %xmm5, %xmm0
490 ; SSE41-NEXT: movdqa %xmm0, %xmm4
491 ; SSE41-NEXT: psllw $12, %xmm4
492 ; SSE41-NEXT: psllw $4, %xmm0
493 ; SSE41-NEXT: por %xmm4, %xmm0
494 ; SSE41-NEXT: movdqa %xmm0, %xmm4
495 ; SSE41-NEXT: paddw %xmm0, %xmm4
496 ; SSE41-NEXT: movdqa %xmm1, %xmm6
497 ; SSE41-NEXT: psrlw $8, %xmm6
498 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
499 ; SSE41-NEXT: movdqa %xmm1, %xmm6
500 ; SSE41-NEXT: psrlw $4, %xmm6
501 ; SSE41-NEXT: movdqa %xmm4, %xmm0
502 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
503 ; SSE41-NEXT: movdqa %xmm1, %xmm6
504 ; SSE41-NEXT: psrlw $2, %xmm6
505 ; SSE41-NEXT: paddw %xmm4, %xmm4
506 ; SSE41-NEXT: movdqa %xmm4, %xmm0
507 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
508 ; SSE41-NEXT: movdqa %xmm1, %xmm6
509 ; SSE41-NEXT: psrlw $1, %xmm6
510 ; SSE41-NEXT: paddw %xmm4, %xmm4
511 ; SSE41-NEXT: movdqa %xmm4, %xmm0
512 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
513 ; SSE41-NEXT: pandn %xmm5, %xmm2
514 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
515 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
516 ; SSE41-NEXT: pslld $23, %xmm2
517 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
518 ; SSE41-NEXT: paddd %xmm4, %xmm2
519 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
520 ; SSE41-NEXT: pslld $23, %xmm0
521 ; SSE41-NEXT: paddd %xmm4, %xmm0
522 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
523 ; SSE41-NEXT: packusdw %xmm2, %xmm0
524 ; SSE41-NEXT: psllw $1, %xmm3
525 ; SSE41-NEXT: pmullw %xmm0, %xmm3
526 ; SSE41-NEXT: por %xmm1, %xmm3
527 ; SSE41-NEXT: movdqa %xmm3, %xmm0
530 ; AVX1-LABEL: var_funnnel_v8i16:
532 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
533 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
534 ; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
535 ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
536 ; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
537 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
538 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6
539 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
540 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
541 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
542 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
543 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
544 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
545 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
546 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
547 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
548 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
549 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
550 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
551 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
552 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
553 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
554 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
555 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
556 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
557 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
558 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
559 ; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
560 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
561 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
564 ; AVX2-LABEL: var_funnnel_v8i16:
566 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
567 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
568 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
569 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
570 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
571 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
572 ; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
573 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
574 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
575 ; AVX2-NEXT: vpsllw $1, %xmm0, %xmm0
576 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
577 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
578 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
579 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
580 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
581 ; AVX2-NEXT: vzeroupper
584 ; AVX512F-LABEL: var_funnnel_v8i16:
586 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
587 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
588 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
589 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
590 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
591 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
592 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
593 ; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
594 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
595 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
596 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
597 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
598 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
599 ; AVX512F-NEXT: vzeroupper
602 ; AVX512VL-LABEL: var_funnnel_v8i16:
604 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
605 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
606 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
607 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
608 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
609 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
610 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
611 ; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
612 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
613 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
614 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
615 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
616 ; AVX512VL-NEXT: vzeroupper
617 ; AVX512VL-NEXT: retq
619 ; AVX512BW-LABEL: var_funnnel_v8i16:
621 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
622 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
623 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
624 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
625 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
626 ; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
627 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
628 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
629 ; AVX512BW-NEXT: vzeroupper
630 ; AVX512BW-NEXT: retq
632 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
633 ; AVX512VBMI2: # %bb.0:
634 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
635 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
636 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
637 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
638 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
639 ; AVX512VBMI2-NEXT: vzeroupper
640 ; AVX512VBMI2-NEXT: retq
642 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
643 ; AVX512VLBW: # %bb.0:
644 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
645 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
646 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
647 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
648 ; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
649 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
650 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
651 ; AVX512VLBW-NEXT: retq
653 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
654 ; AVX512VLVBMI2: # %bb.0:
655 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
656 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
657 ; AVX512VLVBMI2-NEXT: retq
659 ; XOP-LABEL: var_funnnel_v8i16:
661 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
662 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
663 ; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
664 ; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0
665 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
666 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
667 ; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
668 ; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1
669 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
672 ; X86-SSE2-LABEL: var_funnnel_v8i16:
674 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
675 ; X86-SSE2-NEXT: psllw $12, %xmm3
676 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
677 ; X86-SSE2-NEXT: psraw $15, %xmm4
678 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
679 ; X86-SSE2-NEXT: pandn %xmm1, %xmm5
680 ; X86-SSE2-NEXT: psrlw $8, %xmm1
681 ; X86-SSE2-NEXT: pand %xmm4, %xmm1
682 ; X86-SSE2-NEXT: por %xmm5, %xmm1
683 ; X86-SSE2-NEXT: paddw %xmm3, %xmm3
684 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
685 ; X86-SSE2-NEXT: psraw $15, %xmm4
686 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
687 ; X86-SSE2-NEXT: pandn %xmm1, %xmm5
688 ; X86-SSE2-NEXT: psrlw $4, %xmm1
689 ; X86-SSE2-NEXT: pand %xmm4, %xmm1
690 ; X86-SSE2-NEXT: por %xmm5, %xmm1
691 ; X86-SSE2-NEXT: paddw %xmm3, %xmm3
692 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
693 ; X86-SSE2-NEXT: psraw $15, %xmm4
694 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
695 ; X86-SSE2-NEXT: pandn %xmm1, %xmm5
696 ; X86-SSE2-NEXT: psrlw $2, %xmm1
697 ; X86-SSE2-NEXT: pand %xmm4, %xmm1
698 ; X86-SSE2-NEXT: por %xmm5, %xmm1
699 ; X86-SSE2-NEXT: paddw %xmm3, %xmm3
700 ; X86-SSE2-NEXT: psraw $15, %xmm3
701 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
702 ; X86-SSE2-NEXT: pandn %xmm1, %xmm4
703 ; X86-SSE2-NEXT: psrlw $1, %xmm1
704 ; X86-SSE2-NEXT: pand %xmm3, %xmm1
705 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
706 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
707 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
708 ; X86-SSE2-NEXT: pslld $23, %xmm3
709 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
710 ; X86-SSE2-NEXT: paddd %xmm5, %xmm3
711 ; X86-SSE2-NEXT: cvttps2dq %xmm3, %xmm3
712 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
713 ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
714 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
715 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
716 ; X86-SSE2-NEXT: pslld $23, %xmm2
717 ; X86-SSE2-NEXT: paddd %xmm5, %xmm2
718 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
719 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
720 ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
721 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
722 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
723 ; X86-SSE2-NEXT: psllw $1, %xmm0
724 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
725 ; X86-SSE2-NEXT: por %xmm4, %xmm0
726 ; X86-SSE2-NEXT: por %xmm1, %xmm0
727 ; X86-SSE2-NEXT: retl
728 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; var_funnnel_v16i8: funnel shift right on <16 x i8> through @llvm.fshr.v16i8,
; with %amt passed to the intrinsic unmodified (no splat is applied in the IR).
; The per-prefix assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; In the expected code the amount is combined with a splat of 7: the pand/vpand
; result feeds the right-shift half (psrlw / vpsrlvd / vpsrlvw), and the
; pandn/vpandn result feeds the left-shift half, which is applied to a register
; that was first added to itself (paddb / vpaddb).
732 define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
733 ; SSE2-LABEL: var_funnnel_v16i8:
735 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
736 ; SSE2-NEXT: movdqa %xmm2, %xmm4
737 ; SSE2-NEXT: pandn %xmm5, %xmm4
738 ; SSE2-NEXT: psllw $5, %xmm4
739 ; SSE2-NEXT: pxor %xmm3, %xmm3
740 ; SSE2-NEXT: pxor %xmm6, %xmm6
741 ; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
742 ; SSE2-NEXT: paddb %xmm0, %xmm0
743 ; SSE2-NEXT: movdqa %xmm6, %xmm7
744 ; SSE2-NEXT: pandn %xmm0, %xmm7
745 ; SSE2-NEXT: psllw $4, %xmm0
746 ; SSE2-NEXT: pand %xmm6, %xmm0
747 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
748 ; SSE2-NEXT: por %xmm7, %xmm0
749 ; SSE2-NEXT: paddb %xmm4, %xmm4
750 ; SSE2-NEXT: pxor %xmm6, %xmm6
751 ; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
752 ; SSE2-NEXT: movdqa %xmm6, %xmm7
753 ; SSE2-NEXT: pandn %xmm0, %xmm7
754 ; SSE2-NEXT: psllw $2, %xmm0
755 ; SSE2-NEXT: pand %xmm6, %xmm0
756 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
757 ; SSE2-NEXT: por %xmm7, %xmm0
758 ; SSE2-NEXT: paddb %xmm4, %xmm4
759 ; SSE2-NEXT: pxor %xmm6, %xmm6
760 ; SSE2-NEXT: pcmpgtb %xmm4, %xmm6
761 ; SSE2-NEXT: movdqa %xmm6, %xmm4
762 ; SSE2-NEXT: pandn %xmm0, %xmm4
763 ; SSE2-NEXT: paddb %xmm0, %xmm0
764 ; SSE2-NEXT: pand %xmm6, %xmm0
765 ; SSE2-NEXT: pand %xmm5, %xmm2
766 ; SSE2-NEXT: psllw $5, %xmm2
767 ; SSE2-NEXT: pxor %xmm5, %xmm5
768 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm5
769 ; SSE2-NEXT: movdqa %xmm5, %xmm6
770 ; SSE2-NEXT: pandn %xmm1, %xmm6
771 ; SSE2-NEXT: psrlw $4, %xmm1
772 ; SSE2-NEXT: pand %xmm5, %xmm1
773 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
774 ; SSE2-NEXT: por %xmm6, %xmm1
775 ; SSE2-NEXT: paddb %xmm2, %xmm2
776 ; SSE2-NEXT: pxor %xmm5, %xmm5
777 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm5
778 ; SSE2-NEXT: movdqa %xmm5, %xmm6
779 ; SSE2-NEXT: pandn %xmm1, %xmm6
780 ; SSE2-NEXT: psrlw $2, %xmm1
781 ; SSE2-NEXT: pand %xmm5, %xmm1
782 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
783 ; SSE2-NEXT: por %xmm6, %xmm1
784 ; SSE2-NEXT: paddb %xmm2, %xmm2
785 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
786 ; SSE2-NEXT: movdqa %xmm3, %xmm2
787 ; SSE2-NEXT: pandn %xmm1, %xmm2
788 ; SSE2-NEXT: psrlw $1, %xmm1
789 ; SSE2-NEXT: pand %xmm3, %xmm1
790 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
791 ; SSE2-NEXT: por %xmm2, %xmm1
792 ; SSE2-NEXT: por %xmm4, %xmm1
793 ; SSE2-NEXT: por %xmm1, %xmm0
796 ; SSE41-LABEL: var_funnnel_v16i8:
798 ; SSE41-NEXT: movdqa %xmm2, %xmm3
799 ; SSE41-NEXT: movdqa %xmm0, %xmm2
800 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
801 ; SSE41-NEXT: movdqa %xmm3, %xmm0
802 ; SSE41-NEXT: pand %xmm5, %xmm0
803 ; SSE41-NEXT: psllw $5, %xmm0
804 ; SSE41-NEXT: movdqa %xmm0, %xmm4
805 ; SSE41-NEXT: paddb %xmm0, %xmm4
806 ; SSE41-NEXT: movdqa %xmm1, %xmm6
807 ; SSE41-NEXT: psrlw $4, %xmm6
808 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
809 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
810 ; SSE41-NEXT: movdqa %xmm1, %xmm6
811 ; SSE41-NEXT: psrlw $2, %xmm6
812 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
813 ; SSE41-NEXT: movdqa %xmm4, %xmm0
814 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
815 ; SSE41-NEXT: movdqa %xmm1, %xmm6
816 ; SSE41-NEXT: psrlw $1, %xmm6
817 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
818 ; SSE41-NEXT: paddb %xmm4, %xmm4
819 ; SSE41-NEXT: movdqa %xmm4, %xmm0
820 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
821 ; SSE41-NEXT: pandn %xmm5, %xmm3
822 ; SSE41-NEXT: psllw $5, %xmm3
823 ; SSE41-NEXT: movdqa %xmm3, %xmm4
824 ; SSE41-NEXT: paddb %xmm3, %xmm4
825 ; SSE41-NEXT: paddb %xmm2, %xmm2
826 ; SSE41-NEXT: movdqa %xmm2, %xmm5
827 ; SSE41-NEXT: psllw $4, %xmm5
828 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
829 ; SSE41-NEXT: movdqa %xmm3, %xmm0
830 ; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
831 ; SSE41-NEXT: movdqa %xmm2, %xmm3
832 ; SSE41-NEXT: psllw $2, %xmm3
833 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
834 ; SSE41-NEXT: movdqa %xmm4, %xmm0
835 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
836 ; SSE41-NEXT: movdqa %xmm2, %xmm3
837 ; SSE41-NEXT: paddb %xmm2, %xmm3
838 ; SSE41-NEXT: paddb %xmm4, %xmm4
839 ; SSE41-NEXT: movdqa %xmm4, %xmm0
840 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
841 ; SSE41-NEXT: por %xmm1, %xmm2
842 ; SSE41-NEXT: movdqa %xmm2, %xmm0
845 ; AVX-LABEL: var_funnnel_v16i8:
847 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
848 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
849 ; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
850 ; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
851 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm6
852 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
853 ; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
854 ; AVX-NEXT: vpsrlw $2, %xmm1, %xmm4
855 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
856 ; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
857 ; AVX-NEXT: vpsrlw $1, %xmm1, %xmm4
858 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
859 ; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
860 ; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
861 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
862 ; AVX-NEXT: vpsllw $5, %xmm2, %xmm2
863 ; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3
864 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0
865 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm4
866 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
867 ; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
868 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
869 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
870 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
871 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
872 ; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
873 ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
874 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
877 ; AVX512F-LABEL: var_funnnel_v16i8:
879 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
880 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
881 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
882 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
883 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
884 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
885 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
886 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
887 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
888 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
889 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
890 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
891 ; AVX512F-NEXT: vzeroupper
894 ; AVX512VL-LABEL: var_funnnel_v16i8:
896 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
897 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
898 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
899 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
900 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
901 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
902 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
903 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
904 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
905 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
906 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
907 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
908 ; AVX512VL-NEXT: vzeroupper
909 ; AVX512VL-NEXT: retq
911 ; AVX512BW-LABEL: var_funnnel_v16i8:
913 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
914 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
915 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
916 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
917 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
918 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
919 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
920 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
921 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
922 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
923 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
924 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
925 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
926 ; AVX512BW-NEXT: vzeroupper
927 ; AVX512BW-NEXT: retq
929 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
930 ; AVX512VBMI2: # %bb.0:
931 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
932 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
933 ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
934 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
935 ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
936 ; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
937 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
938 ; AVX512VBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
939 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
940 ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
941 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
942 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
943 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
944 ; AVX512VBMI2-NEXT: vzeroupper
945 ; AVX512VBMI2-NEXT: retq
947 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
948 ; AVX512VLBW: # %bb.0:
949 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
950 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
951 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
952 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
953 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
954 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
955 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
956 ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
957 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
958 ; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
959 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
960 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
961 ; AVX512VLBW-NEXT: vzeroupper
962 ; AVX512VLBW-NEXT: retq
964 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
965 ; AVX512VLVBMI2: # %bb.0:
966 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
967 ; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
968 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
969 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
970 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
971 ; AVX512VLVBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
972 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
973 ; AVX512VLVBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
974 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
975 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
976 ; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
977 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
978 ; AVX512VLVBMI2-NEXT: vzeroupper
979 ; AVX512VLVBMI2-NEXT: retq
981 ; XOP-LABEL: var_funnnel_v16i8:
983 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
984 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
985 ; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
986 ; XOP-NEXT: vpshlb %xmm4, %xmm0, %xmm0
987 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
988 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
989 ; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm2
990 ; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
991 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
994 ; X86-SSE2-LABEL: var_funnnel_v16i8:
996 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
997 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
998 ; X86-SSE2-NEXT: pandn %xmm5, %xmm4
999 ; X86-SSE2-NEXT: psllw $5, %xmm4
1000 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
1001 ; X86-SSE2-NEXT: pxor %xmm6, %xmm6
1002 ; X86-SSE2-NEXT: pcmpgtb %xmm4, %xmm6
1003 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1004 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
1005 ; X86-SSE2-NEXT: pandn %xmm0, %xmm7
1006 ; X86-SSE2-NEXT: psllw $4, %xmm0
1007 ; X86-SSE2-NEXT: pand %xmm6, %xmm0
1008 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1009 ; X86-SSE2-NEXT: por %xmm7, %xmm0
1010 ; X86-SSE2-NEXT: paddb %xmm4, %xmm4
1011 ; X86-SSE2-NEXT: pxor %xmm6, %xmm6
1012 ; X86-SSE2-NEXT: pcmpgtb %xmm4, %xmm6
1013 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
1014 ; X86-SSE2-NEXT: pandn %xmm0, %xmm7
1015 ; X86-SSE2-NEXT: psllw $2, %xmm0
1016 ; X86-SSE2-NEXT: pand %xmm6, %xmm0
1017 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1018 ; X86-SSE2-NEXT: por %xmm7, %xmm0
1019 ; X86-SSE2-NEXT: paddb %xmm4, %xmm4
1020 ; X86-SSE2-NEXT: pxor %xmm6, %xmm6
1021 ; X86-SSE2-NEXT: pcmpgtb %xmm4, %xmm6
1022 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm4
1023 ; X86-SSE2-NEXT: pandn %xmm0, %xmm4
1024 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1025 ; X86-SSE2-NEXT: pand %xmm6, %xmm0
1026 ; X86-SSE2-NEXT: pand %xmm5, %xmm2
1027 ; X86-SSE2-NEXT: psllw $5, %xmm2
1028 ; X86-SSE2-NEXT: pxor %xmm5, %xmm5
1029 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm5
1030 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6
1031 ; X86-SSE2-NEXT: pandn %xmm1, %xmm6
1032 ; X86-SSE2-NEXT: psrlw $4, %xmm1
1033 ; X86-SSE2-NEXT: pand %xmm5, %xmm1
1034 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1035 ; X86-SSE2-NEXT: por %xmm6, %xmm1
1036 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1037 ; X86-SSE2-NEXT: pxor %xmm5, %xmm5
1038 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm5
1039 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6
1040 ; X86-SSE2-NEXT: pandn %xmm1, %xmm6
1041 ; X86-SSE2-NEXT: psrlw $2, %xmm1
1042 ; X86-SSE2-NEXT: pand %xmm5, %xmm1
1043 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1044 ; X86-SSE2-NEXT: por %xmm6, %xmm1
1045 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1046 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3
1047 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
1048 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
1049 ; X86-SSE2-NEXT: psrlw $1, %xmm1
1050 ; X86-SSE2-NEXT: pand %xmm3, %xmm1
1051 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1052 ; X86-SSE2-NEXT: por %xmm2, %xmm1
1053 ; X86-SSE2-NEXT: por %xmm4, %xmm1
1054 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1055 ; X86-SSE2-NEXT: retl
1056 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
1061 ; Uniform Variable Shifts
; splatvar_funnnel_v2i64: funnel shift right on <2 x i64> where element 0 of
; %amt is broadcast to both lanes (shufflevector with a zeroinitializer mask)
; and the resulting splat is passed to @llvm.fshr.v2i64.
; The per-prefix assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; The AVX512VBMI2 and AVX512VLVBMI2 prefixes expect vpbroadcastq followed by a
; single vpshrdvq; the remaining prefixes expect the amount to be combined with
; a 63 mask (pand / pandn) and a psrlq + psllq pair whose results are OR-ed.
1064 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
1065 ; SSE-LABEL: splatvar_funnnel_v2i64:
1067 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
1068 ; SSE-NEXT: movdqa %xmm2, %xmm4
1069 ; SSE-NEXT: pand %xmm3, %xmm4
1070 ; SSE-NEXT: psrlq %xmm4, %xmm1
1071 ; SSE-NEXT: pandn %xmm3, %xmm2
1072 ; SSE-NEXT: psllq $1, %xmm0
1073 ; SSE-NEXT: psllq %xmm2, %xmm0
1074 ; SSE-NEXT: por %xmm1, %xmm0
1077 ; AVX-LABEL: splatvar_funnnel_v2i64:
1079 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1080 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
1081 ; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1082 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
1083 ; AVX-NEXT: vpsllq $1, %xmm0, %xmm0
1084 ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1085 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1088 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
1090 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1091 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1092 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1093 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1094 ; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0
1095 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1096 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1097 ; AVX512F-NEXT: retq
1099 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1100 ; AVX512VL: # %bb.0:
1101 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1102 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1103 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1104 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1105 ; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0
1106 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1107 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1108 ; AVX512VL-NEXT: retq
1110 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1111 ; AVX512BW: # %bb.0:
1112 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1113 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1114 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1115 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1116 ; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0
1117 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1118 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1119 ; AVX512BW-NEXT: retq
1121 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1122 ; AVX512VBMI2: # %bb.0:
1123 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1124 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1125 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1126 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1127 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1128 ; AVX512VBMI2-NEXT: vzeroupper
1129 ; AVX512VBMI2-NEXT: retq
1131 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1132 ; AVX512VLBW: # %bb.0:
1133 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1134 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1135 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1136 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1137 ; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0
1138 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1139 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1140 ; AVX512VLBW-NEXT: retq
1142 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1143 ; AVX512VLVBMI2: # %bb.0:
1144 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1145 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
1146 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1147 ; AVX512VLVBMI2-NEXT: retq
1149 ; XOP-LABEL: splatvar_funnnel_v2i64:
1151 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1152 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
1153 ; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1154 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
1155 ; XOP-NEXT: vpsllq $1, %xmm0, %xmm0
1156 ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1157 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1160 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
1161 ; X86-SSE2: # %bb.0:
1162 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1163 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
1164 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1165 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1166 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1167 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
1168 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
1169 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
1170 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
1171 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1172 ; X86-SSE2-NEXT: psllq $1, %xmm0
1173 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
1174 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3
1175 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
1176 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
1177 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
1178 ; X86-SSE2-NEXT: orpd %xmm1, %xmm0
1179 ; X86-SSE2-NEXT: retl
1180 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1181 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
; splatvar_funnnel_v4i32: funnel shift right on <4 x i32> where element 0 of
; %amt is broadcast to all four lanes (shufflevector with a zeroinitializer
; mask) and the resulting splat is passed to @llvm.fshr.v4i32.
; The per-prefix assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
; Expected shapes, by prefix:
;  - SSE2 / X86-SSE2: the amount is moved to a scalar register (movd) and the
;    two shift counts are formed with andl $31 and notl + andl $31.
;  - AVX512VBMI2 / AVX512VLVBMI2: vpbroadcastd followed by a single vpshrdvd.
;  - all other prefixes: a [31,31,31,31] mask (pand / pandn), pmovzxdq of the
;    masked amount, and a psrld + pslld pair whose results are OR-ed.
1185 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1186 ; SSE2-LABEL: splatvar_funnnel_v4i32:
1188 ; SSE2-NEXT: movd %xmm2, %eax
1189 ; SSE2-NEXT: movl %eax, %ecx
1190 ; SSE2-NEXT: andl $31, %ecx
1191 ; SSE2-NEXT: movd %ecx, %xmm2
1192 ; SSE2-NEXT: psrld %xmm2, %xmm1
1193 ; SSE2-NEXT: pslld $1, %xmm0
1194 ; SSE2-NEXT: notl %eax
1195 ; SSE2-NEXT: andl $31, %eax
1196 ; SSE2-NEXT: movd %eax, %xmm2
1197 ; SSE2-NEXT: pslld %xmm2, %xmm0
1198 ; SSE2-NEXT: por %xmm1, %xmm0
1201 ; SSE41-LABEL: splatvar_funnnel_v4i32:
1203 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
1204 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1205 ; SSE41-NEXT: pand %xmm3, %xmm4
1206 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1207 ; SSE41-NEXT: psrld %xmm4, %xmm1
1208 ; SSE41-NEXT: pandn %xmm3, %xmm2
1209 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1210 ; SSE41-NEXT: pslld $1, %xmm0
1211 ; SSE41-NEXT: pslld %xmm2, %xmm0
1212 ; SSE41-NEXT: por %xmm1, %xmm0
1215 ; AVX1-LABEL: splatvar_funnnel_v4i32:
1217 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
1218 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1219 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1220 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1221 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1222 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1223 ; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
1224 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
1225 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1228 ; AVX2-LABEL: splatvar_funnnel_v4i32:
1230 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1231 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1232 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1233 ; AVX2-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1234 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1235 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1236 ; AVX2-NEXT: vpslld $1, %xmm0, %xmm0
1237 ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0
1238 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1241 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
1243 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1244 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1245 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1246 ; AVX512F-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1247 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1248 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1249 ; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0
1250 ; AVX512F-NEXT: vpslld %xmm2, %xmm0, %xmm0
1251 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1252 ; AVX512F-NEXT: retq
1254 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1255 ; AVX512VL: # %bb.0:
1256 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1257 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1258 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1259 ; AVX512VL-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1260 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1261 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1262 ; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0
1263 ; AVX512VL-NEXT: vpslld %xmm2, %xmm0, %xmm0
1264 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1265 ; AVX512VL-NEXT: retq
1267 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1268 ; AVX512BW: # %bb.0:
1269 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1270 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1271 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1272 ; AVX512BW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1273 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1274 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1275 ; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0
1276 ; AVX512BW-NEXT: vpslld %xmm2, %xmm0, %xmm0
1277 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1278 ; AVX512BW-NEXT: retq
1280 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1281 ; AVX512VBMI2: # %bb.0:
1282 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1283 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1284 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1285 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1286 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1287 ; AVX512VBMI2-NEXT: vzeroupper
1288 ; AVX512VBMI2-NEXT: retq
1290 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1291 ; AVX512VLBW: # %bb.0:
1292 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1293 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1294 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1295 ; AVX512VLBW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1296 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1297 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1298 ; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0
1299 ; AVX512VLBW-NEXT: vpslld %xmm2, %xmm0, %xmm0
1300 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1301 ; AVX512VLBW-NEXT: retq
1303 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1304 ; AVX512VLVBMI2: # %bb.0:
1305 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1306 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
1307 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1308 ; AVX512VLVBMI2-NEXT: retq
1310 ; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
1312 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
1313 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1314 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1315 ; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1316 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1317 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1318 ; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
1319 ; XOPAVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0
1320 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1321 ; XOPAVX1-NEXT: retq
1323 ; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
1325 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1326 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1327 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1328 ; XOPAVX2-NEXT: vpsrld %xmm4, %xmm1, %xmm1
1329 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1330 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1331 ; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0
1332 ; XOPAVX2-NEXT: vpslld %xmm2, %xmm0, %xmm0
1333 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1334 ; XOPAVX2-NEXT: retq
1336 ; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1337 ; X86-SSE2: # %bb.0:
1338 ; X86-SSE2-NEXT: movd %xmm2, %eax
1339 ; X86-SSE2-NEXT: movl %eax, %ecx
1340 ; X86-SSE2-NEXT: andl $31, %ecx
1341 ; X86-SSE2-NEXT: movd %ecx, %xmm2
1342 ; X86-SSE2-NEXT: psrld %xmm2, %xmm1
1343 ; X86-SSE2-NEXT: pslld $1, %xmm0
1344 ; X86-SSE2-NEXT: notl %eax
1345 ; X86-SSE2-NEXT: andl $31, %eax
1346 ; X86-SSE2-NEXT: movd %eax, %xmm2
1347 ; X86-SSE2-NEXT: pslld %xmm2, %xmm0
1348 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1349 ; X86-SSE2-NEXT: retl
1350 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1351 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
; Funnel shift right on <8 x i16> with a uniform variable amount. Lane 0 of
; %amt is splatted to all eight lanes and passed as the shift operand of
; @llvm.fshr.v8i16.
; The per-prefix expectations below are autogenerated (see the note on the
; first line of this file). Regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1355 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1356 ; SSE2-LABEL: splatvar_funnnel_v8i16:
1358 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1359 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1360 ; SSE2-NEXT: pand %xmm3, %xmm4
1361 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1362 ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1363 ; SSE2-NEXT: psrlw %xmm4, %xmm1
1364 ; SSE2-NEXT: pandn %xmm3, %xmm2
1365 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
1366 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1367 ; SSE2-NEXT: psllw $1, %xmm0
1368 ; SSE2-NEXT: psllw %xmm2, %xmm0
1369 ; SSE2-NEXT: por %xmm1, %xmm0
1372 ; SSE41-LABEL: splatvar_funnnel_v8i16:
1374 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1375 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1376 ; SSE41-NEXT: pand %xmm3, %xmm4
1377 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1378 ; SSE41-NEXT: psrlw %xmm4, %xmm1
1379 ; SSE41-NEXT: pandn %xmm3, %xmm2
1380 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1381 ; SSE41-NEXT: psllw $1, %xmm0
1382 ; SSE41-NEXT: psllw %xmm2, %xmm0
1383 ; SSE41-NEXT: por %xmm1, %xmm0
1386 ; AVX-LABEL: splatvar_funnnel_v8i16:
1388 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1389 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
1390 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1391 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1392 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
1393 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1394 ; AVX-NEXT: vpsllw $1, %xmm0, %xmm0
1395 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1396 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1399 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
1401 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1402 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1403 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1404 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1405 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1406 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1407 ; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
1408 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1409 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1410 ; AVX512F-NEXT: retq
1412 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1413 ; AVX512VL: # %bb.0:
1414 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1415 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1416 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1417 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1418 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1419 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1420 ; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
1421 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1422 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1423 ; AVX512VL-NEXT: retq
1425 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1426 ; AVX512BW: # %bb.0:
1427 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1428 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1429 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1430 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1431 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1432 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1433 ; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
1434 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1435 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1436 ; AVX512BW-NEXT: retq
1438 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1439 ; AVX512VBMI2: # %bb.0:
1440 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1441 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1442 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1443 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
1444 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1445 ; AVX512VBMI2-NEXT: vzeroupper
1446 ; AVX512VBMI2-NEXT: retq
1448 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1449 ; AVX512VLBW: # %bb.0:
1450 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1451 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1452 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1453 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1454 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1455 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1456 ; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
1457 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1458 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1459 ; AVX512VLBW-NEXT: retq
1461 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1462 ; AVX512VLVBMI2: # %bb.0:
1463 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1464 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
1465 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1466 ; AVX512VLVBMI2-NEXT: retq
1468 ; XOP-LABEL: splatvar_funnnel_v8i16:
1470 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1471 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
1472 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1473 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1474 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
1475 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1476 ; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
1477 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1478 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1481 ; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1482 ; X86-SSE2: # %bb.0:
1483 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1484 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1485 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1486 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1487 ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1488 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
1489 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1490 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
1491 ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1492 ; X86-SSE2-NEXT: psllw $1, %xmm0
1493 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0
1494 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1495 ; X86-SSE2-NEXT: retl
; The all-zero shuffle mask picks element 0 of %amt for every result lane,
; so all eight lanes get the same shift amount.
1496 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
; Funnel shift right of %x and %y, using the splat as the per-lane amount.
1497 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
; Funnel shift right on <16 x i8> with a uniform variable amount. Lane 0 of
; %amt is splatted to all sixteen lanes and passed as the shift operand of
; @llvm.fshr.v16i8.
; The per-prefix expectations below are autogenerated (see the note on the
; first line of this file). Regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1501 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1502 ; SSE2-LABEL: splatvar_funnnel_v16i8:
1504 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1505 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1506 ; SSE2-NEXT: pandn %xmm3, %xmm4
1507 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
1508 ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1509 ; SSE2-NEXT: paddb %xmm0, %xmm0
1510 ; SSE2-NEXT: psllw %xmm4, %xmm0
1511 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
1512 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
1513 ; SSE2-NEXT: psllw %xmm4, %xmm6
1514 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1515 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,0,0,0,4,5,6,7]
1516 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1517 ; SSE2-NEXT: pand %xmm4, %xmm0
1518 ; SSE2-NEXT: pand %xmm3, %xmm2
1519 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
1520 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1521 ; SSE2-NEXT: psrlw %xmm2, %xmm1
1522 ; SSE2-NEXT: psrlw %xmm2, %xmm5
1523 ; SSE2-NEXT: psrlw $8, %xmm5
1524 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1525 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
1526 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1527 ; SSE2-NEXT: pand %xmm1, %xmm2
1528 ; SSE2-NEXT: por %xmm2, %xmm0
1531 ; SSE41-LABEL: splatvar_funnnel_v16i8:
1533 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1534 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1535 ; SSE41-NEXT: pand %xmm3, %xmm4
1536 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1537 ; SSE41-NEXT: psrlw %xmm4, %xmm1
1538 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
1539 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
1540 ; SSE41-NEXT: psrlw %xmm4, %xmm6
1541 ; SSE41-NEXT: pshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1542 ; SSE41-NEXT: pand %xmm1, %xmm6
1543 ; SSE41-NEXT: pandn %xmm3, %xmm2
1544 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1545 ; SSE41-NEXT: paddb %xmm0, %xmm0
1546 ; SSE41-NEXT: psllw %xmm1, %xmm0
1547 ; SSE41-NEXT: psllw %xmm1, %xmm5
1548 ; SSE41-NEXT: pxor %xmm1, %xmm1
1549 ; SSE41-NEXT: pshufb %xmm1, %xmm5
1550 ; SSE41-NEXT: pand %xmm5, %xmm0
1551 ; SSE41-NEXT: por %xmm6, %xmm0
1554 ; AVX1-LABEL: splatvar_funnnel_v16i8:
1556 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1557 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1558 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1559 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1560 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1561 ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
1562 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1563 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
1564 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1565 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1566 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1567 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1568 ; AVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm2
1569 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1570 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1571 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1572 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1575 ; AVX2-LABEL: splatvar_funnnel_v16i8:
1577 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1578 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1579 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1580 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1581 ; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1582 ; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1583 ; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
1584 ; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
1585 ; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
1586 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1587 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1588 ; AVX2-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
1589 ; AVX2-NEXT: vpsrlw %xmm2, %xmm5, %xmm2
1590 ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
1591 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1592 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
1593 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1596 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
1598 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1599 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1600 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1601 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1602 ; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
1603 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1604 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1605 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1606 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1607 ; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0
1608 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
1609 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1610 ; AVX512F-NEXT: vzeroupper
1611 ; AVX512F-NEXT: retq
1613 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1614 ; AVX512VL: # %bb.0:
1615 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1616 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1617 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1618 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1619 ; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
1620 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1621 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1622 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1623 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1624 ; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0
1625 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
1626 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1627 ; AVX512VL-NEXT: vzeroupper
1628 ; AVX512VL-NEXT: retq
1630 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1631 ; AVX512BW: # %bb.0:
1632 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1633 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1634 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1635 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1636 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1637 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1638 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1639 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1640 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1641 ; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1642 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1643 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1644 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1645 ; AVX512BW-NEXT: vzeroupper
1646 ; AVX512BW-NEXT: retq
1648 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1649 ; AVX512VBMI2: # %bb.0:
1650 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1651 ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
1652 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1653 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1654 ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1655 ; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1656 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1657 ; AVX512VBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1658 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1659 ; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1660 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
1661 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1662 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1663 ; AVX512VBMI2-NEXT: vzeroupper
1664 ; AVX512VBMI2-NEXT: retq
1666 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1667 ; AVX512VLBW: # %bb.0:
1668 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1669 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1670 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1671 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1672 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1673 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1674 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1675 ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1676 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1677 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1678 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1679 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1680 ; AVX512VLBW-NEXT: vzeroupper
1681 ; AVX512VLBW-NEXT: retq
1683 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1684 ; AVX512VLVBMI2: # %bb.0:
1685 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1686 ; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
1687 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1688 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1689 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
1690 ; AVX512VLVBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1691 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1692 ; AVX512VLVBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1693 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1694 ; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
1695 ; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
1696 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1697 ; AVX512VLVBMI2-NEXT: vzeroupper
1698 ; AVX512VLVBMI2-NEXT: retq
1700 ; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
1702 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1703 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1704 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1705 ; XOPAVX1-NEXT: vpandn %xmm4, %xmm2, %xmm5
1706 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1707 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
1708 ; XOPAVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1709 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
1710 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1711 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1712 ; XOPAVX1-NEXT: retq
1714 ; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
1716 ; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1717 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1718 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1719 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1720 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
1721 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1722 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1723 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
1724 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1725 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1726 ; XOPAVX2-NEXT: retq
1728 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1729 ; X86-SSE2: # %bb.0:
1730 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1731 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1732 ; X86-SSE2-NEXT: pandn %xmm3, %xmm4
1733 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
1734 ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1735 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1736 ; X86-SSE2-NEXT: psllw %xmm4, %xmm0
1737 ; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5
1738 ; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm6
1739 ; X86-SSE2-NEXT: psllw %xmm4, %xmm6
1740 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1741 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,0,0,0,4,5,6,7]
1742 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1743 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
1744 ; X86-SSE2-NEXT: pand %xmm3, %xmm2
1745 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
1746 ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1747 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm1
1748 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm5
1749 ; X86-SSE2-NEXT: psrlw $8, %xmm5
1750 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1751 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
1752 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1753 ; X86-SSE2-NEXT: pand %xmm1, %xmm2
1754 ; X86-SSE2-NEXT: por %xmm2, %xmm0
1755 ; X86-SSE2-NEXT: retl
; The all-zero shuffle mask picks element 0 of %amt for every result lane,
; so all sixteen lanes get the same shift amount.
1756 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
; Funnel shift right of %x and %y, using the splat as the per-lane amount.
1757 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
; Funnel shift right on <2 x i64> with a different compile-time constant
; amount in each lane (4 for lane 0, 14 for lane 1).
; The per-prefix expectations below are autogenerated (see the note on the
; first line of this file). Regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1765 define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1766 ; SSE2-LABEL: constant_funnnel_v2i64:
1768 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1769 ; SSE2-NEXT: psrlq $4, %xmm2
1770 ; SSE2-NEXT: psrlq $14, %xmm1
1771 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1772 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1773 ; SSE2-NEXT: psllq $60, %xmm2
1774 ; SSE2-NEXT: psllq $50, %xmm0
1775 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1776 ; SSE2-NEXT: orpd %xmm1, %xmm0
1779 ; SSE41-LABEL: constant_funnnel_v2i64:
1781 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1782 ; SSE41-NEXT: psrlq $14, %xmm2
1783 ; SSE41-NEXT: psrlq $4, %xmm1
1784 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1785 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1786 ; SSE41-NEXT: psllq $50, %xmm2
1787 ; SSE41-NEXT: psllq $60, %xmm0
1788 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1789 ; SSE41-NEXT: por %xmm1, %xmm0
1792 ; AVX1-LABEL: constant_funnnel_v2i64:
1794 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
1795 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
1796 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1797 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
1798 ; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
1799 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1800 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1803 ; AVX2-LABEL: constant_funnnel_v2i64:
1805 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1806 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1807 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1810 ; AVX512F-LABEL: constant_funnnel_v2i64:
1812 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1813 ; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1814 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1815 ; AVX512F-NEXT: retq
1817 ; AVX512VL-LABEL: constant_funnnel_v2i64:
1818 ; AVX512VL: # %bb.0:
1819 ; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1820 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1821 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1822 ; AVX512VL-NEXT: retq
1824 ; AVX512BW-LABEL: constant_funnnel_v2i64:
1825 ; AVX512BW: # %bb.0:
1826 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1827 ; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1828 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1829 ; AVX512BW-NEXT: retq
1831 ; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1832 ; AVX512VBMI2: # %bb.0:
1833 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1834 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1835 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14]
1836 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1837 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1838 ; AVX512VBMI2-NEXT: vzeroupper
1839 ; AVX512VBMI2-NEXT: retq
1841 ; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1842 ; AVX512VLBW: # %bb.0:
1843 ; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1844 ; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1845 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1846 ; AVX512VLBW-NEXT: retq
1848 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1849 ; AVX512VLVBMI2: # %bb.0:
1850 ; AVX512VLVBMI2-NEXT: vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1851 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1852 ; AVX512VLVBMI2-NEXT: retq
1854 ; XOPAVX1-LABEL: constant_funnnel_v2i64:
1856 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1857 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1858 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1859 ; XOPAVX1-NEXT: retq
1861 ; XOPAVX2-LABEL: constant_funnnel_v2i64:
1863 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1864 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1865 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1866 ; XOPAVX2-NEXT: retq
1868 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
1869 ; X86-SSE2: # %bb.0:
1870 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
1871 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u>
1872 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
1873 ; X86-SSE2-NEXT: pand %xmm2, %xmm4
1874 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1875 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5
1876 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
1877 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
1878 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
1879 ; X86-SSE2-NEXT: pandn %xmm2, %xmm3
1880 ; X86-SSE2-NEXT: psllq $1, %xmm0
1881 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
1882 ; X86-SSE2-NEXT: psllq %xmm3, %xmm2
1883 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
1884 ; X86-SSE2-NEXT: psllq %xmm3, %xmm0
1885 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1886 ; X86-SSE2-NEXT: orpd %xmm1, %xmm0
1887 ; X86-SSE2-NEXT: retl
; Funnel shift right of %x and %y by the literal per-lane amounts <4, 14>.
1888 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
; Funnel shift right on <4 x i32> with a different compile-time constant
; amount in each lane (4, 5, 6 and 7).
; The per-prefix expectations below are autogenerated (see the note on the
; first line of this file). Regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1892 define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
1893 ; SSE2-LABEL: constant_funnnel_v4i32:
1895 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1896 ; SSE2-NEXT: psrld $7, %xmm2
1897 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1898 ; SSE2-NEXT: psrld $6, %xmm3
1899 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1900 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1901 ; SSE2-NEXT: psrld $5, %xmm2
1902 ; SSE2-NEXT: psrld $4, %xmm1
1903 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1904 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1905 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
1906 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1907 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
1908 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1909 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1910 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
1911 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1912 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1913 ; SSE2-NEXT: por %xmm1, %xmm0
1916 ; SSE41-LABEL: constant_funnnel_v4i32:
1918 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1919 ; SSE41-NEXT: psrld $7, %xmm2
1920 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1921 ; SSE41-NEXT: psrld $5, %xmm3
1922 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1923 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1924 ; SSE41-NEXT: psrld $6, %xmm2
1925 ; SSE41-NEXT: psrld $4, %xmm1
1926 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1927 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1928 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1929 ; SSE41-NEXT: por %xmm1, %xmm0
1932 ; AVX1-LABEL: constant_funnnel_v4i32:
1934 ; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
1935 ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
1936 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1937 ; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
1938 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
1939 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1940 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1941 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1942 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1945 ; AVX2-LABEL: constant_funnnel_v4i32:
1947 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1948 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1949 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1952 ; AVX512F-LABEL: constant_funnnel_v4i32:
1954 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1955 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1956 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1957 ; AVX512F-NEXT: retq
1959 ; AVX512VL-LABEL: constant_funnnel_v4i32:
1960 ; AVX512VL: # %bb.0:
1961 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1962 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1963 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1964 ; AVX512VL-NEXT: retq
1966 ; AVX512BW-LABEL: constant_funnnel_v4i32:
1967 ; AVX512BW: # %bb.0:
1968 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1969 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1970 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1971 ; AVX512BW-NEXT: retq
1973 ; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1974 ; AVX512VBMI2: # %bb.0:
1975 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1976 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1977 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7]
1978 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1979 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1980 ; AVX512VBMI2-NEXT: vzeroupper
1981 ; AVX512VBMI2-NEXT: retq
1983 ; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1984 ; AVX512VLBW: # %bb.0:
1985 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1986 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1987 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1988 ; AVX512VLBW-NEXT: retq
1990 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1991 ; AVX512VLVBMI2: # %bb.0:
1992 ; AVX512VLVBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1993 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1994 ; AVX512VLVBMI2-NEXT: retq
1996 ; XOPAVX1-LABEL: constant_funnnel_v4i32:
1998 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1999 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2000 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2001 ; XOPAVX1-NEXT: retq
2003 ; XOPAVX2-LABEL: constant_funnnel_v4i32:
2005 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2006 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2007 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2008 ; XOPAVX2-NEXT: retq
2010 ; X86-SSE2-LABEL: constant_funnnel_v4i32:
2011 ; X86-SSE2: # %bb.0:
2012 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
2013 ; X86-SSE2-NEXT: psrld $7, %xmm2
2014 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
2015 ; X86-SSE2-NEXT: psrld $6, %xmm3
2016 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2017 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
2018 ; X86-SSE2-NEXT: psrld $5, %xmm2
2019 ; X86-SSE2-NEXT: psrld $4, %xmm1
2020 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2021 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2022 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2023 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2024 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0
2025 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2026 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2027 ; X86-SSE2-NEXT: pmuludq %xmm3, %xmm2
2028 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2029 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2030 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2031 ; X86-SSE2-NEXT: retl
; Funnel shift right of %x and %y by the literal per-lane amounts
; <4, 5, 6, 7>.
2032 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
; Funnel shift right (llvm.fshr) of <8 x i16> %x and %y by the non-uniform
; constant per-lane amounts <0,1,2,3,4,5,6,7>.
; The check lines below are the autogenerated expected llc output for each RUN
; configuration at the top of the file. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Lane 0 has a shift amount of 0. The SSE2 and SSE4.1 sequences appear to handle
; that lane separately from the multiply-based path (the [0,65535,...] mask with
; pandn/pand, and the pblendw that keeps element 0 of the unshifted source).
2036 define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2037 ; SSE2-LABEL: constant_funnnel_v8i16:
2039 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2040 ; SSE2-NEXT: movdqa %xmm2, %xmm3
2041 ; SSE2-NEXT: pandn %xmm1, %xmm3
2042 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2043 ; SSE2-NEXT: pand %xmm1, %xmm2
2044 ; SSE2-NEXT: psllw $1, %xmm0
2045 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2046 ; SSE2-NEXT: por %xmm3, %xmm0
2047 ; SSE2-NEXT: por %xmm2, %xmm0
2050 ; SSE41-LABEL: constant_funnnel_v8i16:
2052 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2053 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2
2054 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
2055 ; SSE41-NEXT: psllw $1, %xmm0
2056 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2057 ; SSE41-NEXT: por %xmm2, %xmm0
2060 ; AVX-LABEL: constant_funnnel_v8i16:
2062 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
2063 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
2064 ; AVX-NEXT: vpsllw $1, %xmm0, %xmm0
2065 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2066 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2069 ; AVX512F-LABEL: constant_funnnel_v8i16:
2071 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
2072 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
2073 ; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
2074 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2075 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2076 ; AVX512F-NEXT: retq
2078 ; AVX512VL-LABEL: constant_funnnel_v8i16:
2079 ; AVX512VL: # %bb.0:
2080 ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
2081 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
2082 ; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
2083 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2084 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2085 ; AVX512VL-NEXT: retq
2087 ; AVX512BW-LABEL: constant_funnnel_v8i16:
2088 ; AVX512BW: # %bb.0:
2089 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2090 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2091 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
2092 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
2093 ; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
2094 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
2095 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2096 ; AVX512BW-NEXT: vzeroupper
2097 ; AVX512BW-NEXT: retq
2099 ; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
2100 ; AVX512VBMI2: # %bb.0:
2101 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2102 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2103 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2104 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
2105 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
2106 ; AVX512VBMI2-NEXT: vzeroupper
2107 ; AVX512VBMI2-NEXT: retq
2109 ; AVX512VLBW-LABEL: constant_funnnel_v8i16:
2110 ; AVX512VLBW: # %bb.0:
2111 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2112 ; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
2113 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2114 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2115 ; AVX512VLBW-NEXT: retq
2117 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
2118 ; AVX512VLVBMI2: # %bb.0:
2119 ; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
2120 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2121 ; AVX512VLVBMI2-NEXT: retq
2123 ; XOP-LABEL: constant_funnnel_v8i16:
2125 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2126 ; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
2127 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2128 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2131 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
2132 ; X86-SSE2: # %bb.0:
2133 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2134 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
2135 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3
2136 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2137 ; X86-SSE2-NEXT: pand %xmm1, %xmm2
2138 ; X86-SSE2-NEXT: psllw $1, %xmm0
2139 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2140 ; X86-SSE2-NEXT: por %xmm3, %xmm0
2141 ; X86-SSE2-NEXT: por %xmm2, %xmm0
2142 ; X86-SSE2-NEXT: retl
2143 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; Funnel shift right (llvm.fshr) of <16 x i8> %x and %y by the non-uniform
; constant per-lane amounts <0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1>.
; Lane 8 uses an amount of 8, which equals the i8 element width. The AVX512BW
; and AVX512VBMI2 check lines materialise that lane's right-shift amount as 0
; (the [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] constant), which is consistent with the
; shift amount being taken modulo the element width.
; The check lines below are the autogenerated expected llc output for each RUN
; configuration at the top of the file. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2147 define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2148 ; SSE2-LABEL: constant_funnnel_v16i8:
2150 ; SSE2-NEXT: pxor %xmm2, %xmm2
2151 ; SSE2-NEXT: movdqa %xmm1, %xmm3
2152 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2153 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2154 ; SSE2-NEXT: psrlw $8, %xmm3
2155 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2156 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2157 ; SSE2-NEXT: psrlw $8, %xmm1
2158 ; SSE2-NEXT: packuswb %xmm3, %xmm1
2159 ; SSE2-NEXT: paddb %xmm0, %xmm0
2160 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2161 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2162 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2163 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2164 ; SSE2-NEXT: pand %xmm3, %xmm2
2165 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2166 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2167 ; SSE2-NEXT: pand %xmm3, %xmm0
2168 ; SSE2-NEXT: packuswb %xmm2, %xmm0
2169 ; SSE2-NEXT: por %xmm1, %xmm0
2172 ; SSE41-LABEL: constant_funnnel_v16i8:
2174 ; SSE41-NEXT: paddb %xmm0, %xmm0
2175 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2176 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2177 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2178 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2179 ; SSE41-NEXT: pand %xmm3, %xmm0
2180 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2181 ; SSE41-NEXT: pand %xmm3, %xmm2
2182 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2183 ; SSE41-NEXT: pxor %xmm3, %xmm3
2184 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2185 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
2186 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2187 ; SSE41-NEXT: psrlw $8, %xmm1
2188 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2189 ; SSE41-NEXT: psrlw $8, %xmm0
2190 ; SSE41-NEXT: packuswb %xmm1, %xmm0
2191 ; SSE41-NEXT: por %xmm2, %xmm0
2194 ; AVX1-LABEL: constant_funnnel_v16i8:
2196 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2197 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2198 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2199 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2200 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2201 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2202 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2203 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2204 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2205 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2206 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2207 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2208 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2209 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2210 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2211 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2212 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2213 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2216 ; AVX2-LABEL: constant_funnnel_v16i8:
2218 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2219 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2220 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2221 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2222 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2223 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2224 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2225 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2226 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2227 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2228 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2229 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2230 ; AVX2-NEXT: vzeroupper
2233 ; AVX512F-LABEL: constant_funnnel_v16i8:
2235 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2236 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2237 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2238 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2239 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2240 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
2241 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2242 ; AVX512F-NEXT: vzeroupper
2243 ; AVX512F-NEXT: retq
2245 ; AVX512VL-LABEL: constant_funnnel_v16i8:
2246 ; AVX512VL: # %bb.0:
2247 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2248 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2249 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2250 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2251 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2252 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
2253 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2254 ; AVX512VL-NEXT: vzeroupper
2255 ; AVX512VL-NEXT: retq
2257 ; AVX512BW-LABEL: constant_funnnel_v16i8:
2258 ; AVX512BW: # %bb.0:
2259 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2260 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2261 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
2262 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
2263 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2264 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2265 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
2266 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2267 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2268 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2269 ; AVX512BW-NEXT: vzeroupper
2270 ; AVX512BW-NEXT: retq
2272 ; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2273 ; AVX512VBMI2: # %bb.0:
2274 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2275 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2276 ; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
2277 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,7,0,1,2,3,4,5,6]
2278 ; AVX512VBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2279 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2280 ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
2281 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
2282 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2283 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2284 ; AVX512VBMI2-NEXT: vzeroupper
2285 ; AVX512VBMI2-NEXT: retq
2287 ; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2288 ; AVX512VLBW: # %bb.0:
2289 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2290 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2291 ; AVX512VLBW-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2292 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2293 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2294 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2295 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2296 ; AVX512VLBW-NEXT: vzeroupper
2297 ; AVX512VLBW-NEXT: retq
2299 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2300 ; AVX512VLVBMI2: # %bb.0:
2301 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2302 ; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2303 ; AVX512VLVBMI2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2304 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2305 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2306 ; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
2307 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2308 ; AVX512VLVBMI2-NEXT: vzeroupper
2309 ; AVX512VLVBMI2-NEXT: retq
2311 ; XOP-LABEL: constant_funnnel_v16i8:
2313 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2314 ; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2315 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2316 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2319 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
2320 ; X86-SSE2: # %bb.0:
2321 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
2322 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
2323 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2324 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
2325 ; X86-SSE2-NEXT: psrlw $8, %xmm3
2326 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2327 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2328 ; X86-SSE2-NEXT: psrlw $8, %xmm1
2329 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
2330 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
2331 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
2332 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2333 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
2334 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2335 ; X86-SSE2-NEXT: pand %xmm3, %xmm2
2336 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2337 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2338 ; X86-SSE2-NEXT: pand %xmm3, %xmm0
2339 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
2340 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2341 ; X86-SSE2-NEXT: retl
2342 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2347 ; Uniform Constant Shifts (every lane uses the same constant shift amount)
; Funnel shift right (llvm.fshr) of <2 x i64> %x and %y by the splat constant
; amount 14 in both lanes.
; Most configurations are expected to lower to a logical right shift by 14, a
; left shift by 50 (64 - 14) and an OR. The two AVX512VBMI2 configurations are
; expected to use a single vpshrdq with immediate 14 instead.
; The check lines below are the autogenerated expected llc output for each RUN
; configuration at the top of the file. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2350 define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2351 ; SSE-LABEL: splatconstant_funnnel_v2i64:
2353 ; SSE-NEXT: psrlq $14, %xmm1
2354 ; SSE-NEXT: psllq $50, %xmm0
2355 ; SSE-NEXT: por %xmm1, %xmm0
2358 ; AVX-LABEL: splatconstant_funnnel_v2i64:
2360 ; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
2361 ; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
2362 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2365 ; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2367 ; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1
2368 ; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0
2369 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2370 ; AVX512F-NEXT: retq
2372 ; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2373 ; AVX512VL: # %bb.0:
2374 ; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1
2375 ; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0
2376 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2377 ; AVX512VL-NEXT: retq
2379 ; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2380 ; AVX512BW: # %bb.0:
2381 ; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1
2382 ; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0
2383 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2384 ; AVX512BW-NEXT: retq
2386 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2387 ; AVX512VBMI2: # %bb.0:
2388 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2389 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2390 ; AVX512VBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0
2391 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2392 ; AVX512VBMI2-NEXT: vzeroupper
2393 ; AVX512VBMI2-NEXT: retq
2395 ; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2396 ; AVX512VLBW: # %bb.0:
2397 ; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1
2398 ; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0
2399 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2400 ; AVX512VLBW-NEXT: retq
2402 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2403 ; AVX512VLVBMI2: # %bb.0:
2404 ; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0
2405 ; AVX512VLVBMI2-NEXT: retq
2407 ; XOP-LABEL: splatconstant_funnnel_v2i64:
2409 ; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
2410 ; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
2411 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2414 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
2415 ; X86-SSE2: # %bb.0:
2416 ; X86-SSE2-NEXT: psrlq $14, %xmm1
2417 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
2418 ; X86-SSE2-NEXT: psllq $50, %xmm0
2419 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
2420 ; X86-SSE2-NEXT: orpd %xmm1, %xmm0
2421 ; X86-SSE2-NEXT: retl
2422 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
; Funnel shift right (llvm.fshr) of <4 x i32> %x and %y by the splat constant
; amount 4 in every lane.
; Most configurations are expected to lower to a logical right shift by 4, a
; left shift by 28 (32 - 4) and an OR. The two AVX512VBMI2 configurations are
; expected to use a single vpshrdd with immediate 4 instead.
; The check lines below are the autogenerated expected llc output for each RUN
; configuration at the top of the file. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2426 define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2427 ; SSE-LABEL: splatconstant_funnnel_v4i32:
2429 ; SSE-NEXT: psrld $4, %xmm1
2430 ; SSE-NEXT: pslld $28, %xmm0
2431 ; SSE-NEXT: por %xmm1, %xmm0
2434 ; AVX-LABEL: splatconstant_funnnel_v4i32:
2436 ; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
2437 ; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2438 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2441 ; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2443 ; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1
2444 ; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0
2445 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2446 ; AVX512F-NEXT: retq
2448 ; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2449 ; AVX512VL: # %bb.0:
2450 ; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
2451 ; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0
2452 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2453 ; AVX512VL-NEXT: retq
2455 ; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2456 ; AVX512BW: # %bb.0:
2457 ; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1
2458 ; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0
2459 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2460 ; AVX512BW-NEXT: retq
2462 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2463 ; AVX512VBMI2: # %bb.0:
2464 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2465 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2466 ; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0
2467 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2468 ; AVX512VBMI2-NEXT: vzeroupper
2469 ; AVX512VBMI2-NEXT: retq
2471 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2472 ; AVX512VLBW: # %bb.0:
2473 ; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1
2474 ; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0
2475 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2476 ; AVX512VLBW-NEXT: retq
2478 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2479 ; AVX512VLVBMI2: # %bb.0:
2480 ; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
2481 ; AVX512VLVBMI2-NEXT: retq
2483 ; XOP-LABEL: splatconstant_funnnel_v4i32:
2485 ; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
2486 ; XOP-NEXT: vpslld $28, %xmm0, %xmm0
2487 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2490 ; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
2491 ; X86-SSE2: # %bb.0:
2492 ; X86-SSE2-NEXT: psrld $4, %xmm1
2493 ; X86-SSE2-NEXT: pslld $28, %xmm0
2494 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2495 ; X86-SSE2-NEXT: retl
2496 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
; Funnel shift right (llvm.fshr) of <8 x i16> %x and %y by the splat constant
; amount 7 in every lane.
; Most configurations are expected to lower to a logical right shift by 7, a
; left shift by 9 (16 - 7) and an OR. The two AVX512VBMI2 configurations are
; expected to use a single vpshrdw with immediate 7 instead.
; The check lines below are the autogenerated expected llc output for each RUN
; configuration at the top of the file. Regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2500 define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2501 ; SSE-LABEL: splatconstant_funnnel_v8i16:
2503 ; SSE-NEXT: psrlw $7, %xmm1
2504 ; SSE-NEXT: psllw $9, %xmm0
2505 ; SSE-NEXT: por %xmm1, %xmm0
2508 ; AVX-LABEL: splatconstant_funnnel_v8i16:
2510 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
2511 ; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2512 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2515 ; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2517 ; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1
2518 ; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
2519 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2520 ; AVX512F-NEXT: retq
2522 ; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2523 ; AVX512VL: # %bb.0:
2524 ; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1
2525 ; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
2526 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2527 ; AVX512VL-NEXT: retq
2529 ; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2530 ; AVX512BW: # %bb.0:
2531 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
2532 ; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
2533 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2534 ; AVX512BW-NEXT: retq
2536 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2537 ; AVX512VBMI2: # %bb.0:
2538 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2539 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2540 ; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0
2541 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2542 ; AVX512VBMI2-NEXT: vzeroupper
2543 ; AVX512VBMI2-NEXT: retq
2545 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2546 ; AVX512VLBW: # %bb.0:
2547 ; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1
2548 ; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
2549 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2550 ; AVX512VLBW-NEXT: retq
2552 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2553 ; AVX512VLVBMI2: # %bb.0:
2554 ; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0
2555 ; AVX512VLVBMI2-NEXT: retq
2557 ; XOP-LABEL: splatconstant_funnnel_v8i16:
2559 ; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
2560 ; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
2561 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2564 ; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
2565 ; X86-SSE2: # %bb.0:
2566 ; X86-SSE2-NEXT: psrlw $7, %xmm1
2567 ; X86-SSE2-NEXT: psllw $9, %xmm0
2568 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2569 ; X86-SSE2-NEXT: retl
2570 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
2574 define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2575 ; SSE-LABEL: splatconstant_funnnel_v16i8:
2577 ; SSE-NEXT: psrlw $4, %xmm1
2578 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2579 ; SSE-NEXT: psllw $4, %xmm0
2580 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2581 ; SSE-NEXT: por %xmm1, %xmm0
2584 ; AVX-LABEL: splatconstant_funnnel_v16i8:
2586 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
2587 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2588 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2589 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2590 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2593 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
2595 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm1
2596 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2597 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
2598 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2599 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2600 ; AVX512F-NEXT: retq
2602 ; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
2603 ; AVX512VL: # %bb.0:
2604 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
2605 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
2606 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
2607 ; AVX512VL-NEXT: retq
2609 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
2610 ; AVX512BW: # %bb.0:
2611 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm1
2612 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2613 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
2614 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2615 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2616 ; AVX512BW-NEXT: retq
2618 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
2619 ; AVX512VBMI2: # %bb.0:
2620 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm1
2621 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2622 ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm0
2623 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2624 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2625 ; AVX512VBMI2-NEXT: retq
2627 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
2628 ; AVX512VLBW: # %bb.0:
2629 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
2630 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
2631 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
2632 ; AVX512VLBW-NEXT: retq
2634 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
2635 ; AVX512VLVBMI2: # %bb.0:
2636 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2637 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2638 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
2639 ; AVX512VLVBMI2-NEXT: retq
2641 ; XOP-LABEL: splatconstant_funnnel_v16i8:
2643 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2644 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2645 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2648 ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
2649 ; X86-SSE2: # %bb.0:
2650 ; X86-SSE2-NEXT: psrlw $4, %xmm1
2651 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2652 ; X86-SSE2-NEXT: psllw $4, %xmm0
2653 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2654 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2655 ; X86-SSE2-NEXT: retl
2656 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)