1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
15 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
; Declarations of the funnel-shift-right intrinsic (@llvm.fshr) for the
; 128-bit vector types v2i64, v4i32, v8i16 and v16i8. Each overload takes
; three operands of the same vector type and returns that type.
18 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
20 declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
21 declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
; var_funnnel_v2i64: funnel shift right on <2 x i64> with a variable,
; per-call shift-amount vector. %res is computed by passing %x, %y and %amt
; straight to @llvm.fshr.v2i64. The check lines below are autogenerated (see
; the NOTE at the top of the file) and pin the expected llc output for each
; RUN-line prefix; regenerate them with the script rather than editing by hand.
27 define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
28 ; SSE2-LABEL: var_funnnel_v2i64:
30 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
31 ; SSE2-NEXT: movdqa %xmm1, %xmm3
32 ; SSE2-NEXT: psrlq %xmm2, %xmm3
33 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
34 ; SSE2-NEXT: movdqa %xmm1, %xmm5
35 ; SSE2-NEXT: psrlq %xmm4, %xmm5
36 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
37 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
38 ; SSE2-NEXT: psubq %xmm2, %xmm3
39 ; SSE2-NEXT: movdqa %xmm0, %xmm4
40 ; SSE2-NEXT: psllq %xmm3, %xmm4
41 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
42 ; SSE2-NEXT: psllq %xmm3, %xmm0
43 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
44 ; SSE2-NEXT: orpd %xmm5, %xmm0
45 ; SSE2-NEXT: pxor %xmm3, %xmm3
46 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
47 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
48 ; SSE2-NEXT: pand %xmm3, %xmm2
49 ; SSE2-NEXT: pand %xmm2, %xmm1
50 ; SSE2-NEXT: pandn %xmm0, %xmm2
51 ; SSE2-NEXT: por %xmm1, %xmm2
52 ; SSE2-NEXT: movdqa %xmm2, %xmm0
55 ; SSE41-LABEL: var_funnnel_v2i64:
57 ; SSE41-NEXT: movdqa %xmm0, %xmm3
58 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
59 ; SSE41-NEXT: movdqa %xmm1, %xmm0
60 ; SSE41-NEXT: psrlq %xmm2, %xmm0
61 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
62 ; SSE41-NEXT: movdqa %xmm1, %xmm5
63 ; SSE41-NEXT: psrlq %xmm4, %xmm5
64 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
65 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
66 ; SSE41-NEXT: psubq %xmm2, %xmm0
67 ; SSE41-NEXT: movdqa %xmm3, %xmm4
68 ; SSE41-NEXT: psllq %xmm0, %xmm4
69 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
70 ; SSE41-NEXT: psllq %xmm0, %xmm3
71 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
72 ; SSE41-NEXT: por %xmm5, %xmm3
73 ; SSE41-NEXT: pxor %xmm0, %xmm0
74 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
75 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
76 ; SSE41-NEXT: movapd %xmm3, %xmm0
79 ; AVX1-LABEL: var_funnnel_v2i64:
81 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
82 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
83 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
84 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4
85 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
86 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
87 ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
88 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5
89 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
90 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
91 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
92 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
93 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
94 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
95 ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
98 ; AVX2-LABEL: var_funnnel_v2i64:
100 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
101 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
102 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
103 ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
104 ; AVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
105 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
106 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
107 ; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
108 ; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
111 ; AVX512F-LABEL: var_funnnel_v2i64:
113 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
114 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
115 ; AVX512F-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
116 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
117 ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
118 ; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
119 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
120 ; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
121 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
122 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
123 ; AVX512F-NEXT: vzeroupper
126 ; AVX512VL-LABEL: var_funnnel_v2i64:
128 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
129 ; AVX512VL-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
130 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
131 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
132 ; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
133 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
134 ; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
135 ; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
136 ; AVX512VL-NEXT: retq
138 ; AVX512BW-LABEL: var_funnnel_v2i64:
140 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
141 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
142 ; AVX512BW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
143 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
144 ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
145 ; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
146 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
147 ; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
148 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
149 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
150 ; AVX512BW-NEXT: vzeroupper
151 ; AVX512BW-NEXT: retq
153 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
154 ; AVX512VBMI2: # %bb.0:
155 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
156 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
157 ; AVX512VBMI2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
158 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
159 ; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
160 ; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
161 ; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
162 ; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
163 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
164 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
165 ; AVX512VBMI2-NEXT: vzeroupper
166 ; AVX512VBMI2-NEXT: retq
168 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
169 ; AVX512VLBW: # %bb.0:
170 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
171 ; AVX512VLBW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
172 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
173 ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
174 ; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
175 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
176 ; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
177 ; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
178 ; AVX512VLBW-NEXT: retq
180 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
181 ; AVX512VLVBMI2: # %bb.0:
182 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
183 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
184 ; AVX512VLVBMI2-NEXT: retq
186 ; XOPAVX1-LABEL: var_funnnel_v2i64:
188 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
189 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
190 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4
191 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm4
192 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64]
193 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5
194 ; XOPAVX1-NEXT: vpshlq %xmm5, %xmm0, %xmm0
195 ; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
196 ; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
197 ; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
200 ; XOPAVX2-LABEL: var_funnnel_v2i64:
202 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
203 ; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
204 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
205 ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
206 ; XOPAVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
207 ; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
208 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
209 ; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
210 ; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
213 ; X32-SSE-LABEL: var_funnnel_v2i64:
215 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
216 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
217 ; X32-SSE-NEXT: psrlq %xmm2, %xmm3
218 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
219 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
220 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
221 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
222 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
223 ; X32-SSE-NEXT: psubq %xmm2, %xmm3
224 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
225 ; X32-SSE-NEXT: psllq %xmm3, %xmm4
226 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
227 ; X32-SSE-NEXT: psllq %xmm3, %xmm0
228 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
229 ; X32-SSE-NEXT: orpd %xmm5, %xmm0
230 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
231 ; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
232 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
233 ; X32-SSE-NEXT: pand %xmm3, %xmm2
234 ; X32-SSE-NEXT: pand %xmm2, %xmm1
235 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
236 ; X32-SSE-NEXT: por %xmm1, %xmm2
237 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
239 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; var_funnnel_v4i32: funnel shift right on <4 x i32> with a variable,
; per-call shift-amount vector. %res is computed by passing %x, %y and %amt
; straight to @llvm.fshr.v4i32. The check lines below are autogenerated (see
; the NOTE at the top of the file) and pin the expected llc output for each
; RUN-line prefix; regenerate them with the script rather than editing by hand.
243 define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
244 ; SSE2-LABEL: var_funnnel_v4i32:
246 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
247 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
248 ; SSE2-NEXT: movdqa %xmm1, %xmm4
249 ; SSE2-NEXT: psrld %xmm3, %xmm4
250 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
251 ; SSE2-NEXT: movdqa %xmm1, %xmm3
252 ; SSE2-NEXT: psrld %xmm5, %xmm3
253 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
254 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
255 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
256 ; SSE2-NEXT: movdqa %xmm1, %xmm6
257 ; SSE2-NEXT: psrld %xmm5, %xmm6
258 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
259 ; SSE2-NEXT: movdqa %xmm1, %xmm5
260 ; SSE2-NEXT: psrld %xmm4, %xmm5
261 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
262 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
263 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
264 ; SSE2-NEXT: psubd %xmm2, %xmm4
265 ; SSE2-NEXT: pslld $23, %xmm4
266 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm4
267 ; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
268 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
269 ; SSE2-NEXT: pmuludq %xmm4, %xmm0
270 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
271 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
272 ; SSE2-NEXT: pmuludq %xmm5, %xmm0
273 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
274 ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
275 ; SSE2-NEXT: por %xmm3, %xmm6
276 ; SSE2-NEXT: pxor %xmm0, %xmm0
277 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
278 ; SSE2-NEXT: pand %xmm0, %xmm1
279 ; SSE2-NEXT: pandn %xmm6, %xmm0
280 ; SSE2-NEXT: por %xmm1, %xmm0
283 ; SSE41-LABEL: var_funnnel_v4i32:
285 ; SSE41-NEXT: movdqa %xmm0, %xmm3
286 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
287 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
288 ; SSE41-NEXT: movdqa %xmm1, %xmm4
289 ; SSE41-NEXT: psrld %xmm0, %xmm4
290 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
291 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
292 ; SSE41-NEXT: movdqa %xmm1, %xmm6
293 ; SSE41-NEXT: psrld %xmm5, %xmm6
294 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
295 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
296 ; SSE41-NEXT: movdqa %xmm1, %xmm5
297 ; SSE41-NEXT: psrld %xmm4, %xmm5
298 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
299 ; SSE41-NEXT: movdqa %xmm1, %xmm4
300 ; SSE41-NEXT: psrld %xmm0, %xmm4
301 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
302 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
303 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
304 ; SSE41-NEXT: psubd %xmm2, %xmm0
305 ; SSE41-NEXT: pslld $23, %xmm0
306 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
307 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
308 ; SSE41-NEXT: pmulld %xmm0, %xmm3
309 ; SSE41-NEXT: por %xmm4, %xmm3
310 ; SSE41-NEXT: pxor %xmm0, %xmm0
311 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
312 ; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
313 ; SSE41-NEXT: movaps %xmm3, %xmm0
316 ; AVX1-LABEL: var_funnnel_v4i32:
318 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
319 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
320 ; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
321 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
322 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
323 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
324 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
325 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
326 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
327 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
328 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
329 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
330 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
331 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
332 ; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
333 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
334 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm5, %xmm5
335 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
336 ; AVX1-NEXT: vpmulld %xmm5, %xmm0, %xmm0
337 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
338 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
339 ; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
342 ; AVX2-LABEL: var_funnnel_v4i32:
344 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
345 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
346 ; AVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
347 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
348 ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
349 ; AVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
350 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
351 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
352 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
353 ; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
356 ; AVX512F-LABEL: var_funnnel_v4i32:
358 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
359 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
360 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
361 ; AVX512F-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
362 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
363 ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
364 ; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
365 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
366 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
367 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
368 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
369 ; AVX512F-NEXT: vzeroupper
372 ; AVX512VL-LABEL: var_funnnel_v4i32:
374 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
375 ; AVX512VL-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
376 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
377 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
378 ; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
379 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
380 ; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
381 ; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
382 ; AVX512VL-NEXT: retq
384 ; AVX512BW-LABEL: var_funnnel_v4i32:
386 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
387 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
388 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
389 ; AVX512BW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
390 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
391 ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
392 ; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
393 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
394 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
395 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
396 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
397 ; AVX512BW-NEXT: vzeroupper
398 ; AVX512BW-NEXT: retq
400 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
401 ; AVX512VBMI2: # %bb.0:
402 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
403 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
404 ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
405 ; AVX512VBMI2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
406 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
407 ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
408 ; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
409 ; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
410 ; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
411 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
412 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
413 ; AVX512VBMI2-NEXT: vzeroupper
414 ; AVX512VBMI2-NEXT: retq
416 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
417 ; AVX512VLBW: # %bb.0:
418 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
419 ; AVX512VLBW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
420 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
421 ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
422 ; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
423 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
424 ; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
425 ; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
426 ; AVX512VLBW-NEXT: retq
428 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
429 ; AVX512VLVBMI2: # %bb.0:
430 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
431 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
432 ; AVX512VLVBMI2-NEXT: retq
434 ; XOPAVX1-LABEL: var_funnnel_v4i32:
436 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
437 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
438 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4
439 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm4
440 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
441 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
442 ; XOPAVX1-NEXT: vpshld %xmm5, %xmm0, %xmm0
443 ; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
444 ; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
445 ; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
448 ; XOPAVX2-LABEL: var_funnnel_v4i32:
450 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
451 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
452 ; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
453 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
454 ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
455 ; XOPAVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
456 ; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
457 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
458 ; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
459 ; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
462 ; X32-SSE-LABEL: var_funnnel_v4i32:
464 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
465 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
466 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4
467 ; X32-SSE-NEXT: psrld %xmm3, %xmm4
468 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
469 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
470 ; X32-SSE-NEXT: psrld %xmm5, %xmm3
471 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
472 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
473 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
474 ; X32-SSE-NEXT: movdqa %xmm1, %xmm6
475 ; X32-SSE-NEXT: psrld %xmm5, %xmm6
476 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
477 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
478 ; X32-SSE-NEXT: psrld %xmm4, %xmm5
479 ; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
480 ; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
481 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
482 ; X32-SSE-NEXT: psubd %xmm2, %xmm4
483 ; X32-SSE-NEXT: pslld $23, %xmm4
484 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm4
485 ; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
486 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
487 ; X32-SSE-NEXT: pmuludq %xmm4, %xmm0
488 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
489 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
490 ; X32-SSE-NEXT: pmuludq %xmm5, %xmm0
491 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
492 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
493 ; X32-SSE-NEXT: por %xmm3, %xmm6
494 ; X32-SSE-NEXT: pxor %xmm0, %xmm0
495 ; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
496 ; X32-SSE-NEXT: pand %xmm0, %xmm1
497 ; X32-SSE-NEXT: pandn %xmm6, %xmm0
498 ; X32-SSE-NEXT: por %xmm1, %xmm0
500 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
504 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
505 ; SSE2-LABEL: var_funnnel_v8i16:
507 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
508 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
509 ; SSE2-NEXT: psubw %xmm2, %xmm4
510 ; SSE2-NEXT: pxor %xmm3, %xmm3
511 ; SSE2-NEXT: pcmpeqw %xmm2, %xmm3
512 ; SSE2-NEXT: psllw $12, %xmm2
513 ; SSE2-NEXT: movdqa %xmm2, %xmm5
514 ; SSE2-NEXT: psraw $15, %xmm5
515 ; SSE2-NEXT: movdqa %xmm1, %xmm6
516 ; SSE2-NEXT: psrlw $8, %xmm6
517 ; SSE2-NEXT: pand %xmm5, %xmm6
518 ; SSE2-NEXT: pandn %xmm1, %xmm5
519 ; SSE2-NEXT: por %xmm6, %xmm5
520 ; SSE2-NEXT: paddw %xmm2, %xmm2
521 ; SSE2-NEXT: movdqa %xmm2, %xmm6
522 ; SSE2-NEXT: psraw $15, %xmm6
523 ; SSE2-NEXT: movdqa %xmm6, %xmm7
524 ; SSE2-NEXT: pandn %xmm5, %xmm7
525 ; SSE2-NEXT: psrlw $4, %xmm5
526 ; SSE2-NEXT: pand %xmm6, %xmm5
527 ; SSE2-NEXT: por %xmm7, %xmm5
528 ; SSE2-NEXT: paddw %xmm2, %xmm2
529 ; SSE2-NEXT: movdqa %xmm2, %xmm6
530 ; SSE2-NEXT: psraw $15, %xmm6
531 ; SSE2-NEXT: movdqa %xmm6, %xmm7
532 ; SSE2-NEXT: pandn %xmm5, %xmm7
533 ; SSE2-NEXT: psrlw $2, %xmm5
534 ; SSE2-NEXT: pand %xmm6, %xmm5
535 ; SSE2-NEXT: por %xmm7, %xmm5
536 ; SSE2-NEXT: paddw %xmm2, %xmm2
537 ; SSE2-NEXT: psraw $15, %xmm2
538 ; SSE2-NEXT: movdqa %xmm2, %xmm6
539 ; SSE2-NEXT: pandn %xmm5, %xmm6
540 ; SSE2-NEXT: psrlw $1, %xmm5
541 ; SSE2-NEXT: pand %xmm2, %xmm5
542 ; SSE2-NEXT: movdqa %xmm4, %xmm2
543 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
544 ; SSE2-NEXT: pslld $23, %xmm2
545 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
546 ; SSE2-NEXT: paddd %xmm7, %xmm2
547 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
548 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
549 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
550 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
551 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
552 ; SSE2-NEXT: pslld $23, %xmm4
553 ; SSE2-NEXT: paddd %xmm7, %xmm4
554 ; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
555 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
556 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
557 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
558 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
559 ; SSE2-NEXT: pmullw %xmm0, %xmm4
560 ; SSE2-NEXT: por %xmm6, %xmm4
561 ; SSE2-NEXT: por %xmm5, %xmm4
562 ; SSE2-NEXT: pand %xmm3, %xmm1
563 ; SSE2-NEXT: pandn %xmm4, %xmm3
564 ; SSE2-NEXT: por %xmm1, %xmm3
565 ; SSE2-NEXT: movdqa %xmm3, %xmm0
568 ; SSE41-LABEL: var_funnnel_v8i16:
570 ; SSE41-NEXT: movdqa %xmm0, %xmm3
571 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
572 ; SSE41-NEXT: movdqa %xmm2, %xmm0
573 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
574 ; SSE41-NEXT: psubw %xmm2, %xmm5
575 ; SSE41-NEXT: pxor %xmm4, %xmm4
576 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm4
577 ; SSE41-NEXT: psllw $12, %xmm2
578 ; SSE41-NEXT: psllw $4, %xmm0
579 ; SSE41-NEXT: por %xmm2, %xmm0
580 ; SSE41-NEXT: movdqa %xmm0, %xmm2
581 ; SSE41-NEXT: paddw %xmm0, %xmm2
582 ; SSE41-NEXT: movdqa %xmm1, %xmm6
583 ; SSE41-NEXT: psrlw $8, %xmm6
584 ; SSE41-NEXT: movdqa %xmm1, %xmm7
585 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
586 ; SSE41-NEXT: movdqa %xmm7, %xmm6
587 ; SSE41-NEXT: psrlw $4, %xmm6
588 ; SSE41-NEXT: movdqa %xmm2, %xmm0
589 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
590 ; SSE41-NEXT: movdqa %xmm7, %xmm6
591 ; SSE41-NEXT: psrlw $2, %xmm6
592 ; SSE41-NEXT: paddw %xmm2, %xmm2
593 ; SSE41-NEXT: movdqa %xmm2, %xmm0
594 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
595 ; SSE41-NEXT: movdqa %xmm7, %xmm6
596 ; SSE41-NEXT: psrlw $1, %xmm6
597 ; SSE41-NEXT: paddw %xmm2, %xmm2
598 ; SSE41-NEXT: movdqa %xmm2, %xmm0
599 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
600 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
601 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
602 ; SSE41-NEXT: pslld $23, %xmm5
603 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
604 ; SSE41-NEXT: paddd %xmm2, %xmm5
605 ; SSE41-NEXT: cvttps2dq %xmm5, %xmm5
606 ; SSE41-NEXT: pslld $23, %xmm0
607 ; SSE41-NEXT: paddd %xmm2, %xmm0
608 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
609 ; SSE41-NEXT: packusdw %xmm5, %xmm0
610 ; SSE41-NEXT: pmullw %xmm0, %xmm3
611 ; SSE41-NEXT: por %xmm7, %xmm3
612 ; SSE41-NEXT: movdqa %xmm4, %xmm0
613 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
614 ; SSE41-NEXT: movdqa %xmm3, %xmm0
617 ; AVX1-LABEL: var_funnnel_v8i16:
619 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
620 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
621 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
622 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
623 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm4
624 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
625 ; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm3
626 ; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
627 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
628 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
629 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
630 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
631 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
632 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
633 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
634 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
635 ; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
636 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
637 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
638 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
639 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
640 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
641 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
642 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
643 ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
644 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
645 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
646 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
647 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
648 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
649 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
650 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
653 ; AVX2-LABEL: var_funnnel_v8i16:
655 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
656 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
657 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
658 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
659 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
660 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
661 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
662 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
663 ; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5
664 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
665 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
666 ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm0
667 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
668 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
669 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
670 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
671 ; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
672 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
673 ; AVX2-NEXT: vzeroupper
676 ; AVX512F-LABEL: var_funnnel_v8i16:
678 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
679 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
680 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
681 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
682 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
683 ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
684 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
685 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
686 ; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
687 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
688 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
689 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
690 ; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
691 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
692 ; AVX512F-NEXT: vzeroupper
695 ; AVX512VL-LABEL: var_funnnel_v8i16:
697 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
698 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
699 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
700 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
701 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
702 ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
703 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
704 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
705 ; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
706 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
707 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
708 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
709 ; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
710 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
711 ; AVX512VL-NEXT: vzeroupper
712 ; AVX512VL-NEXT: retq
714 ; AVX512BW-LABEL: var_funnnel_v8i16:
716 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
717 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
718 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
719 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
720 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
721 ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
722 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
723 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
724 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
725 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
726 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
727 ; AVX512BW-NEXT: vzeroupper
728 ; AVX512BW-NEXT: retq
730 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
731 ; AVX512VBMI2: # %bb.0:
732 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
733 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
734 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
735 ; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
736 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
737 ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
738 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
739 ; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
740 ; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
741 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
742 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
743 ; AVX512VBMI2-NEXT: vzeroupper
744 ; AVX512VBMI2-NEXT: retq
746 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
747 ; AVX512VLBW: # %bb.0:
748 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
749 ; AVX512VLBW-NEXT: vpsrlvw %xmm2, %xmm1, %xmm3
750 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
751 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
752 ; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
753 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
754 ; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
755 ; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
756 ; AVX512VLBW-NEXT: retq
758 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
759 ; AVX512VLVBMI2: # %bb.0:
760 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
761 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
762 ; AVX512VLVBMI2-NEXT: retq
764 ; XOP-LABEL: var_funnnel_v8i16:
766 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
767 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
768 ; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm4
769 ; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm4
770 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
771 ; XOP-NEXT: vpsubw %xmm2, %xmm5, %xmm5
772 ; XOP-NEXT: vpshlw %xmm5, %xmm0, %xmm0
773 ; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
774 ; XOP-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
775 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
778 ; X32-SSE-LABEL: var_funnnel_v8i16:
780 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
781 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
782 ; X32-SSE-NEXT: psubw %xmm2, %xmm4
783 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
784 ; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3
785 ; X32-SSE-NEXT: psllw $12, %xmm2
786 ; X32-SSE-NEXT: movdqa %xmm2, %xmm5
787 ; X32-SSE-NEXT: psraw $15, %xmm5
788 ; X32-SSE-NEXT: movdqa %xmm1, %xmm6
789 ; X32-SSE-NEXT: psrlw $8, %xmm6
790 ; X32-SSE-NEXT: pand %xmm5, %xmm6
791 ; X32-SSE-NEXT: pandn %xmm1, %xmm5
792 ; X32-SSE-NEXT: por %xmm6, %xmm5
793 ; X32-SSE-NEXT: paddw %xmm2, %xmm2
794 ; X32-SSE-NEXT: movdqa %xmm2, %xmm6
795 ; X32-SSE-NEXT: psraw $15, %xmm6
796 ; X32-SSE-NEXT: movdqa %xmm6, %xmm7
797 ; X32-SSE-NEXT: pandn %xmm5, %xmm7
798 ; X32-SSE-NEXT: psrlw $4, %xmm5
799 ; X32-SSE-NEXT: pand %xmm6, %xmm5
800 ; X32-SSE-NEXT: por %xmm7, %xmm5
801 ; X32-SSE-NEXT: paddw %xmm2, %xmm2
802 ; X32-SSE-NEXT: movdqa %xmm2, %xmm6
803 ; X32-SSE-NEXT: psraw $15, %xmm6
804 ; X32-SSE-NEXT: movdqa %xmm6, %xmm7
805 ; X32-SSE-NEXT: pandn %xmm5, %xmm7
806 ; X32-SSE-NEXT: psrlw $2, %xmm5
807 ; X32-SSE-NEXT: pand %xmm6, %xmm5
808 ; X32-SSE-NEXT: por %xmm7, %xmm5
809 ; X32-SSE-NEXT: paddw %xmm2, %xmm2
810 ; X32-SSE-NEXT: psraw $15, %xmm2
811 ; X32-SSE-NEXT: movdqa %xmm2, %xmm6
812 ; X32-SSE-NEXT: pandn %xmm5, %xmm6
813 ; X32-SSE-NEXT: psrlw $1, %xmm5
814 ; X32-SSE-NEXT: pand %xmm2, %xmm5
815 ; X32-SSE-NEXT: movdqa %xmm4, %xmm2
816 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
817 ; X32-SSE-NEXT: pslld $23, %xmm2
818 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
819 ; X32-SSE-NEXT: paddd %xmm7, %xmm2
820 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
821 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
822 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
823 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
824 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
825 ; X32-SSE-NEXT: pslld $23, %xmm4
826 ; X32-SSE-NEXT: paddd %xmm7, %xmm4
827 ; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
828 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
829 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
830 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
831 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
832 ; X32-SSE-NEXT: pmullw %xmm0, %xmm4
833 ; X32-SSE-NEXT: por %xmm6, %xmm4
834 ; X32-SSE-NEXT: por %xmm5, %xmm4
835 ; X32-SSE-NEXT: pand %xmm3, %xmm1
836 ; X32-SSE-NEXT: pandn %xmm4, %xmm3
837 ; X32-SSE-NEXT: por %xmm1, %xmm3
838 ; X32-SSE-NEXT: movdqa %xmm3, %xmm0
840 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; Variable (per-lane) funnel shift right on <16 x i8>.
; The IR body is a single call to @llvm.fshr.v16i8 with %x, %y and %amt, so each
; byte lane is shifted by its own amount. Per the LLVM LangRef, fshr concatenates
; %x (high half) with %y (low half), shifts right by the amount taken modulo the
; element width, and returns the low half.
; The assertion lines below are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of this file); regenerate them with that script
; instead of editing them by hand.
844 define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
845 ; SSE2-LABEL: var_funnnel_v16i8:
847 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
848 ; SSE2-NEXT: movdqa %xmm2, %xmm5
849 ; SSE2-NEXT: psllw $5, %xmm5
850 ; SSE2-NEXT: pxor %xmm3, %xmm3
851 ; SSE2-NEXT: pxor %xmm6, %xmm6
852 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
853 ; SSE2-NEXT: movdqa %xmm1, %xmm4
854 ; SSE2-NEXT: psrlw $4, %xmm4
855 ; SSE2-NEXT: pand %xmm6, %xmm4
856 ; SSE2-NEXT: pandn %xmm1, %xmm6
857 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
858 ; SSE2-NEXT: por %xmm6, %xmm4
859 ; SSE2-NEXT: paddb %xmm5, %xmm5
860 ; SSE2-NEXT: pxor %xmm6, %xmm6
861 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
862 ; SSE2-NEXT: movdqa %xmm6, %xmm7
863 ; SSE2-NEXT: pandn %xmm4, %xmm7
864 ; SSE2-NEXT: psrlw $2, %xmm4
865 ; SSE2-NEXT: pand %xmm6, %xmm4
866 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
867 ; SSE2-NEXT: por %xmm7, %xmm4
868 ; SSE2-NEXT: paddb %xmm5, %xmm5
869 ; SSE2-NEXT: pxor %xmm6, %xmm6
870 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
871 ; SSE2-NEXT: movdqa %xmm6, %xmm5
872 ; SSE2-NEXT: pandn %xmm4, %xmm5
873 ; SSE2-NEXT: psrlw $1, %xmm4
874 ; SSE2-NEXT: pand %xmm6, %xmm4
875 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
876 ; SSE2-NEXT: por %xmm5, %xmm4
877 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
878 ; SSE2-NEXT: psubb %xmm2, %xmm5
879 ; SSE2-NEXT: psllw $5, %xmm5
880 ; SSE2-NEXT: pxor %xmm6, %xmm6
881 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
882 ; SSE2-NEXT: movdqa %xmm6, %xmm7
883 ; SSE2-NEXT: pandn %xmm0, %xmm7
884 ; SSE2-NEXT: psllw $4, %xmm0
885 ; SSE2-NEXT: pand %xmm6, %xmm0
886 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
887 ; SSE2-NEXT: por %xmm7, %xmm0
888 ; SSE2-NEXT: paddb %xmm5, %xmm5
889 ; SSE2-NEXT: pxor %xmm6, %xmm6
890 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
891 ; SSE2-NEXT: movdqa %xmm6, %xmm7
892 ; SSE2-NEXT: pandn %xmm0, %xmm7
893 ; SSE2-NEXT: psllw $2, %xmm0
894 ; SSE2-NEXT: pand %xmm6, %xmm0
895 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
896 ; SSE2-NEXT: por %xmm7, %xmm0
897 ; SSE2-NEXT: paddb %xmm5, %xmm5
898 ; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
899 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm3
900 ; SSE2-NEXT: movdqa %xmm3, %xmm5
901 ; SSE2-NEXT: pandn %xmm0, %xmm5
902 ; SSE2-NEXT: por %xmm4, %xmm5
903 ; SSE2-NEXT: paddb %xmm0, %xmm0
904 ; SSE2-NEXT: pand %xmm3, %xmm0
905 ; SSE2-NEXT: por %xmm5, %xmm0
906 ; SSE2-NEXT: pand %xmm2, %xmm1
907 ; SSE2-NEXT: pandn %xmm0, %xmm2
908 ; SSE2-NEXT: por %xmm1, %xmm2
909 ; SSE2-NEXT: movdqa %xmm2, %xmm0
912 ; SSE41-LABEL: var_funnnel_v16i8:
914 ; SSE41-NEXT: movdqa %xmm0, %xmm3
915 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
916 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
917 ; SSE41-NEXT: psubb %xmm2, %xmm4
918 ; SSE41-NEXT: pxor %xmm5, %xmm5
919 ; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
920 ; SSE41-NEXT: movdqa %xmm2, %xmm0
921 ; SSE41-NEXT: psllw $5, %xmm0
922 ; SSE41-NEXT: movdqa %xmm1, %xmm2
923 ; SSE41-NEXT: psrlw $4, %xmm2
924 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
925 ; SSE41-NEXT: movdqa %xmm1, %xmm6
926 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
927 ; SSE41-NEXT: movdqa %xmm6, %xmm2
928 ; SSE41-NEXT: psrlw $2, %xmm2
929 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
930 ; SSE41-NEXT: paddb %xmm0, %xmm0
931 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
932 ; SSE41-NEXT: movdqa %xmm6, %xmm2
933 ; SSE41-NEXT: psrlw $1, %xmm2
934 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
935 ; SSE41-NEXT: paddb %xmm0, %xmm0
936 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
937 ; SSE41-NEXT: psllw $5, %xmm4
938 ; SSE41-NEXT: movdqa %xmm4, %xmm2
939 ; SSE41-NEXT: paddb %xmm4, %xmm2
940 ; SSE41-NEXT: movdqa %xmm3, %xmm7
941 ; SSE41-NEXT: psllw $4, %xmm7
942 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
943 ; SSE41-NEXT: movdqa %xmm4, %xmm0
944 ; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
945 ; SSE41-NEXT: movdqa %xmm3, %xmm4
946 ; SSE41-NEXT: psllw $2, %xmm4
947 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
948 ; SSE41-NEXT: movdqa %xmm2, %xmm0
949 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
950 ; SSE41-NEXT: movdqa %xmm3, %xmm4
951 ; SSE41-NEXT: paddb %xmm3, %xmm4
952 ; SSE41-NEXT: paddb %xmm2, %xmm2
953 ; SSE41-NEXT: movdqa %xmm2, %xmm0
954 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
955 ; SSE41-NEXT: por %xmm6, %xmm3
956 ; SSE41-NEXT: movdqa %xmm5, %xmm0
957 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
958 ; SSE41-NEXT: movdqa %xmm3, %xmm0
961 ; AVX-LABEL: var_funnnel_v16i8:
963 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
964 ; AVX-NEXT: vpsllw $5, %xmm2, %xmm3
965 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm4
966 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
967 ; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm4
968 ; AVX-NEXT: vpsrlw $2, %xmm4, %xmm5
969 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
970 ; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
971 ; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
972 ; AVX-NEXT: vpsrlw $1, %xmm4, %xmm5
973 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
974 ; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
975 ; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
976 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
977 ; AVX-NEXT: vpsubb %xmm2, %xmm4, %xmm4
978 ; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
979 ; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
980 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm6
981 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
982 ; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm0, %xmm0
983 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm4
984 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
985 ; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
986 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm4
987 ; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
988 ; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
989 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
990 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
991 ; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
992 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
995 ; AVX512F-LABEL: var_funnnel_v16i8:
997 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
998 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
999 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1000 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1001 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1002 ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1003 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
1004 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1005 ; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
1006 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
1007 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1008 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1009 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1010 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1011 ; AVX512F-NEXT: vzeroupper
1012 ; AVX512F-NEXT: retq
1014 ; AVX512VL-LABEL: var_funnnel_v16i8:
1015 ; AVX512VL: # %bb.0:
1016 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1017 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1018 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1019 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1020 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1021 ; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1022 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
1023 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1024 ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
1025 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
1026 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1027 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1028 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1029 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1030 ; AVX512VL-NEXT: vzeroupper
1031 ; AVX512VL-NEXT: retq
1033 ; AVX512BW-LABEL: var_funnnel_v16i8:
1034 ; AVX512BW: # %bb.0:
1035 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1036 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1037 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1038 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1039 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
1040 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1041 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1042 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1043 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1044 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
1045 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
1046 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1047 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
1048 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1049 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1050 ; AVX512BW-NEXT: vzeroupper
1051 ; AVX512BW-NEXT: retq
1053 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
1054 ; AVX512VBMI2: # %bb.0:
1055 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1056 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1057 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1058 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1059 ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
1060 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1061 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1062 ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1063 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1064 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
1065 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
1066 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1067 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
1068 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1069 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1070 ; AVX512VBMI2-NEXT: vzeroupper
1071 ; AVX512VBMI2-NEXT: retq
1073 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
1074 ; AVX512VLBW: # %bb.0:
1075 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1076 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1077 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1078 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
1079 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1080 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1081 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1082 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1083 ; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
1084 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
1085 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1086 ; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
1087 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
1088 ; AVX512VLBW-NEXT: vzeroupper
1089 ; AVX512VLBW-NEXT: retq
1091 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
1092 ; AVX512VLVBMI2: # %bb.0:
1093 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1094 ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1095 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1096 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
1097 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1098 ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1099 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1100 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1101 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
1102 ; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
1103 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1104 ; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
1105 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
1106 ; AVX512VLVBMI2-NEXT: vzeroupper
1107 ; AVX512VLVBMI2-NEXT: retq
1109 ; XOP-LABEL: var_funnnel_v16i8:
1111 ; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1112 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
1113 ; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm4
1114 ; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm4
1115 ; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1116 ; XOP-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1117 ; XOP-NEXT: vpshlb %xmm5, %xmm0, %xmm0
1118 ; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
1119 ; XOP-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
1120 ; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1123 ; X32-SSE-LABEL: var_funnnel_v16i8:
1125 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1126 ; X32-SSE-NEXT: movdqa %xmm2, %xmm5
1127 ; X32-SSE-NEXT: psllw $5, %xmm5
1128 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
1129 ; X32-SSE-NEXT: pxor %xmm6, %xmm6
1130 ; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1131 ; X32-SSE-NEXT: movdqa %xmm1, %xmm4
1132 ; X32-SSE-NEXT: psrlw $4, %xmm4
1133 ; X32-SSE-NEXT: pand %xmm6, %xmm4
1134 ; X32-SSE-NEXT: pandn %xmm1, %xmm6
1135 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1136 ; X32-SSE-NEXT: por %xmm6, %xmm4
1137 ; X32-SSE-NEXT: paddb %xmm5, %xmm5
1138 ; X32-SSE-NEXT: pxor %xmm6, %xmm6
1139 ; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1140 ; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1141 ; X32-SSE-NEXT: pandn %xmm4, %xmm7
1142 ; X32-SSE-NEXT: psrlw $2, %xmm4
1143 ; X32-SSE-NEXT: pand %xmm6, %xmm4
1144 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1145 ; X32-SSE-NEXT: por %xmm7, %xmm4
1146 ; X32-SSE-NEXT: paddb %xmm5, %xmm5
1147 ; X32-SSE-NEXT: pxor %xmm6, %xmm6
1148 ; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1149 ; X32-SSE-NEXT: movdqa %xmm6, %xmm5
1150 ; X32-SSE-NEXT: pandn %xmm4, %xmm5
1151 ; X32-SSE-NEXT: psrlw $1, %xmm4
1152 ; X32-SSE-NEXT: pand %xmm6, %xmm4
1153 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1154 ; X32-SSE-NEXT: por %xmm5, %xmm4
1155 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1156 ; X32-SSE-NEXT: psubb %xmm2, %xmm5
1157 ; X32-SSE-NEXT: psllw $5, %xmm5
1158 ; X32-SSE-NEXT: pxor %xmm6, %xmm6
1159 ; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1160 ; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1161 ; X32-SSE-NEXT: pandn %xmm0, %xmm7
1162 ; X32-SSE-NEXT: psllw $4, %xmm0
1163 ; X32-SSE-NEXT: pand %xmm6, %xmm0
1164 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1165 ; X32-SSE-NEXT: por %xmm7, %xmm0
1166 ; X32-SSE-NEXT: paddb %xmm5, %xmm5
1167 ; X32-SSE-NEXT: pxor %xmm6, %xmm6
1168 ; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1169 ; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1170 ; X32-SSE-NEXT: pandn %xmm0, %xmm7
1171 ; X32-SSE-NEXT: psllw $2, %xmm0
1172 ; X32-SSE-NEXT: pand %xmm6, %xmm0
1173 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1174 ; X32-SSE-NEXT: por %xmm7, %xmm0
1175 ; X32-SSE-NEXT: paddb %xmm5, %xmm5
1176 ; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
1177 ; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm3
1178 ; X32-SSE-NEXT: movdqa %xmm3, %xmm5
1179 ; X32-SSE-NEXT: pandn %xmm0, %xmm5
1180 ; X32-SSE-NEXT: por %xmm4, %xmm5
1181 ; X32-SSE-NEXT: paddb %xmm0, %xmm0
1182 ; X32-SSE-NEXT: pand %xmm3, %xmm0
1183 ; X32-SSE-NEXT: por %xmm5, %xmm0
1184 ; X32-SSE-NEXT: pand %xmm2, %xmm1
1185 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
1186 ; X32-SSE-NEXT: por %xmm1, %xmm2
1187 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1188 ; X32-SSE-NEXT: retl
1189 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
1194 ; Uniform Variable Shifts
1197 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
1198 ; SSE2-LABEL: splatvar_funnnel_v2i64:
1200 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1201 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1202 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1203 ; SSE2-NEXT: psrlq %xmm2, %xmm3
1204 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
1205 ; SSE2-NEXT: psubq %xmm2, %xmm4
1206 ; SSE2-NEXT: psllq %xmm4, %xmm0
1207 ; SSE2-NEXT: por %xmm3, %xmm0
1208 ; SSE2-NEXT: pxor %xmm3, %xmm3
1209 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
1210 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
1211 ; SSE2-NEXT: pand %xmm3, %xmm2
1212 ; SSE2-NEXT: pand %xmm2, %xmm1
1213 ; SSE2-NEXT: pandn %xmm0, %xmm2
1214 ; SSE2-NEXT: por %xmm1, %xmm2
1215 ; SSE2-NEXT: movdqa %xmm2, %xmm0
1218 ; SSE41-LABEL: splatvar_funnnel_v2i64:
1220 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1221 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1222 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1223 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1224 ; SSE41-NEXT: psrlq %xmm2, %xmm0
1225 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
1226 ; SSE41-NEXT: psubq %xmm2, %xmm4
1227 ; SSE41-NEXT: psllq %xmm4, %xmm3
1228 ; SSE41-NEXT: por %xmm0, %xmm3
1229 ; SSE41-NEXT: pxor %xmm0, %xmm0
1230 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
1231 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
1232 ; SSE41-NEXT: movapd %xmm3, %xmm0
1235 ; AVX1-LABEL: splatvar_funnnel_v2i64:
1237 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1238 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1239 ; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1240 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1241 ; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1242 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1243 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1244 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1245 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1246 ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1249 ; AVX2-LABEL: splatvar_funnnel_v2i64:
1251 ; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
1252 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1253 ; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1254 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1255 ; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1256 ; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1257 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1258 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1259 ; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1260 ; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1263 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
1265 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1266 ; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
1267 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1268 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1269 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1270 ; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1271 ; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1272 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
1273 ; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
1274 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1275 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1276 ; AVX512F-NEXT: vzeroupper
1277 ; AVX512F-NEXT: retq
1279 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1280 ; AVX512VL: # %bb.0:
1281 ; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
1282 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1283 ; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1284 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1285 ; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1286 ; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1287 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
1288 ; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
1289 ; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
1290 ; AVX512VL-NEXT: retq
1292 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1293 ; AVX512BW: # %bb.0:
1294 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1295 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
1296 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1297 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1298 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1299 ; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1300 ; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1301 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
1302 ; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
1303 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1304 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1305 ; AVX512BW-NEXT: vzeroupper
1306 ; AVX512BW-NEXT: retq
1308 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1309 ; AVX512VBMI2: # %bb.0:
1310 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1311 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1312 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1313 ; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1314 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1315 ; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1316 ; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1317 ; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
1318 ; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
1319 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1320 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1321 ; AVX512VBMI2-NEXT: vzeroupper
1322 ; AVX512VBMI2-NEXT: retq
1324 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1325 ; AVX512VLBW: # %bb.0:
1326 ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
1327 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1328 ; AVX512VLBW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1329 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1330 ; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1331 ; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1332 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
1333 ; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
1334 ; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
1335 ; AVX512VLBW-NEXT: retq
1337 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1338 ; AVX512VLVBMI2: # %bb.0:
1339 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1340 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
1341 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1342 ; AVX512VLVBMI2-NEXT: retq
1344 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
1346 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1347 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1348 ; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1349 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1350 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1351 ; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1352 ; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1353 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1354 ; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
1355 ; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1356 ; XOPAVX1-NEXT: retq
1358 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
1360 ; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
1361 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1362 ; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1363 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1364 ; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1365 ; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1366 ; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1367 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1368 ; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
1369 ; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1370 ; XOPAVX2-NEXT: retq
1372 ; X32-SSE-LABEL: splatvar_funnnel_v2i64:
1374 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1375 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1376 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
1377 ; X32-SSE-NEXT: psrlq %xmm2, %xmm3
1378 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
1379 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1380 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
1381 ; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1382 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
1383 ; X32-SSE-NEXT: psubq %xmm2, %xmm3
1384 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
1385 ; X32-SSE-NEXT: psllq %xmm3, %xmm4
1386 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1387 ; X32-SSE-NEXT: psllq %xmm3, %xmm0
1388 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1389 ; X32-SSE-NEXT: orpd %xmm5, %xmm0
1390 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
1391 ; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
1392 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
1393 ; X32-SSE-NEXT: pand %xmm3, %xmm2
1394 ; X32-SSE-NEXT: pand %xmm2, %xmm1
1395 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
1396 ; X32-SSE-NEXT: por %xmm1, %xmm2
1397 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1398 ; X32-SSE-NEXT: retl
1399 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1400 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
; Funnel shift right (llvm.fshr) on <4 x i32> where the shift amount is a
; splat of element 0 of %amt, i.e. every lane is shifted by the same
; variable amount.
;
; The assertion lines inside this function are autogenerated (the note at the
; top of the file names utils/update_llc_test_checks.py); regenerate them with
; that script instead of editing them by hand.
;
; Reading the expected output with xmm0/xmm1/xmm2 as %x/%y/%amt:
;  - the avx512vbmi2+avx512vl configuration expects a broadcast of the amount
;    followed by a single vpshrdvd;
;  - the other configurations mask the amount with 31, compute
;    (%y >> amt) | (%x << (32 - amt)), and then select %y for lanes whose
;    masked amount is zero (compare-with-zero plus blend / and-andn-or /
;    masked move).
1404 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1405 ; SSE2-LABEL: splatvar_funnnel_v4i32:
1407 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1408 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1409 ; SSE2-NEXT: pxor %xmm3, %xmm3
1410 ; SSE2-NEXT: xorps %xmm4, %xmm4
1411 ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1412 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1413 ; SSE2-NEXT: psrld %xmm4, %xmm5
1414 ; SSE2-NEXT: movd %xmm2, %eax
1415 ; SSE2-NEXT: movl $32, %ecx
1416 ; SSE2-NEXT: subl %eax, %ecx
1417 ; SSE2-NEXT: movd %ecx, %xmm4
1418 ; SSE2-NEXT: pslld %xmm4, %xmm0
1419 ; SSE2-NEXT: por %xmm5, %xmm0
1420 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
1421 ; SSE2-NEXT: pand %xmm2, %xmm1
1422 ; SSE2-NEXT: pandn %xmm0, %xmm2
1423 ; SSE2-NEXT: por %xmm1, %xmm2
1424 ; SSE2-NEXT: movdqa %xmm2, %xmm0
1427 ; SSE41-LABEL: splatvar_funnnel_v4i32:
1429 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1430 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1431 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1432 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
1433 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1434 ; SSE41-NEXT: psrld %xmm0, %xmm4
1435 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
1436 ; SSE41-NEXT: psubd %xmm2, %xmm0
1437 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1438 ; SSE41-NEXT: pslld %xmm0, %xmm3
1439 ; SSE41-NEXT: por %xmm4, %xmm3
1440 ; SSE41-NEXT: pxor %xmm0, %xmm0
1441 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
1442 ; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
1443 ; SSE41-NEXT: movaps %xmm3, %xmm0
1446 ; AVX1-LABEL: splatvar_funnnel_v4i32:
1448 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1449 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1450 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1451 ; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1452 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1453 ; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1454 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1455 ; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1456 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1457 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1458 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1459 ; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1462 ; AVX2-LABEL: splatvar_funnnel_v4i32:
1464 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1465 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1466 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1467 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1468 ; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1469 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1470 ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1471 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1472 ; AVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1473 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1474 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1475 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1476 ; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1479 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
1481 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1482 ; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
1483 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1484 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
1485 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1486 ; AVX512F-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1487 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1488 ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1489 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1490 ; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
1491 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
1492 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
1493 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1494 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1495 ; AVX512F-NEXT: vzeroupper
1496 ; AVX512F-NEXT: retq
1498 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1499 ; AVX512VL: # %bb.0:
1500 ; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
1501 ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
1502 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1503 ; AVX512VL-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1504 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1505 ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1506 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1507 ; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
1508 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
1509 ; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
1510 ; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1511 ; AVX512VL-NEXT: retq
1513 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1514 ; AVX512BW: # %bb.0:
1515 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1516 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
1517 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1518 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
1519 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1520 ; AVX512BW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1521 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1522 ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1523 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1524 ; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1525 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
1526 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
1527 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1528 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1529 ; AVX512BW-NEXT: vzeroupper
1530 ; AVX512BW-NEXT: retq
1532 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1533 ; AVX512VBMI2: # %bb.0:
1534 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1535 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1536 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1537 ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
1538 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1539 ; AVX512VBMI2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1540 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1541 ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1542 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1543 ; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1544 ; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
1545 ; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
1546 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1547 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1548 ; AVX512VBMI2-NEXT: vzeroupper
1549 ; AVX512VBMI2-NEXT: retq
1551 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1552 ; AVX512VLBW: # %bb.0:
1553 ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
1554 ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
1555 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1556 ; AVX512VLBW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1557 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1558 ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1559 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1560 ; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1561 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
1562 ; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
1563 ; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1564 ; AVX512VLBW-NEXT: retq
1566 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1567 ; AVX512VLVBMI2: # %bb.0:
1568 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1569 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
1570 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1571 ; AVX512VLVBMI2-NEXT: retq
1573 ; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
1575 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1576 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1577 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1578 ; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1579 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1580 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1581 ; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1582 ; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1583 ; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1584 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1585 ; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1586 ; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1587 ; XOPAVX1-NEXT: retq
1589 ; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
1591 ; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1592 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1593 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1594 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1595 ; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1596 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1597 ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1598 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1599 ; XOPAVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1600 ; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1601 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1602 ; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1603 ; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1604 ; XOPAVX2-NEXT: retq
1606 ; X32-SSE-LABEL: splatvar_funnnel_v4i32:
1608 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1609 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1610 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
1611 ; X32-SSE-NEXT: xorps %xmm4, %xmm4
1612 ; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1613 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1614 ; X32-SSE-NEXT: psrld %xmm4, %xmm5
1615 ; X32-SSE-NEXT: movd %xmm2, %eax
1616 ; X32-SSE-NEXT: movl $32, %ecx
1617 ; X32-SSE-NEXT: subl %eax, %ecx
1618 ; X32-SSE-NEXT: movd %ecx, %xmm4
1619 ; X32-SSE-NEXT: pslld %xmm4, %xmm0
1620 ; X32-SSE-NEXT: por %xmm5, %xmm0
1621 ; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2
1622 ; X32-SSE-NEXT: pand %xmm2, %xmm1
1623 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
1624 ; X32-SSE-NEXT: por %xmm1, %xmm2
1625 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1626 ; X32-SSE-NEXT: retl
; Broadcast element 0 of %amt into all four lanes (all-zero shuffle mask).
1627 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
; Per-lane funnel shift right: %x supplies the high half and %y the low half
; of each concatenated value, shifted by the splatted amount.
1628 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
; Funnel shift right (llvm.fshr) on <8 x i16> where the shift amount is a
; splat of element 0 of %amt, i.e. every lane is shifted by the same
; variable amount.
;
; The assertion lines inside this function are autogenerated (the note at the
; top of the file names utils/update_llc_test_checks.py); regenerate them with
; that script instead of editing them by hand.
;
; Reading the expected output with xmm0/xmm1/xmm2 as %x/%y/%amt:
;  - the avx512vbmi2+avx512vl configuration expects a broadcast of the amount
;    followed by a single vpshrdvw;
;  - the other configurations mask the amount, compute
;    (%y >> amt) | (%x << (16 - amt)) with the 16-bit word shifts, and then
;    select %y for lanes whose masked amount is zero (compare-with-zero plus
;    blend / and-andn-or / masked move).
1632 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1633 ; SSE2-LABEL: splatvar_funnnel_v8i16:
1635 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1636 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1637 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1638 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1639 ; SSE2-NEXT: psubw %xmm3, %xmm4
1640 ; SSE2-NEXT: pxor %xmm2, %xmm2
1641 ; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
1642 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1643 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1644 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1645 ; SSE2-NEXT: psrlw %xmm3, %xmm5
1646 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1647 ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1648 ; SSE2-NEXT: psllw %xmm4, %xmm0
1649 ; SSE2-NEXT: por %xmm5, %xmm0
1650 ; SSE2-NEXT: pand %xmm2, %xmm1
1651 ; SSE2-NEXT: pandn %xmm0, %xmm2
1652 ; SSE2-NEXT: por %xmm1, %xmm2
1653 ; SSE2-NEXT: movdqa %xmm2, %xmm0
1656 ; SSE41-LABEL: splatvar_funnnel_v8i16:
1658 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1659 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
1660 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
1661 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1662 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1663 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1664 ; SSE41-NEXT: psrlw %xmm0, %xmm4
1665 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
1666 ; SSE41-NEXT: psubw %xmm2, %xmm0
1667 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1668 ; SSE41-NEXT: psllw %xmm0, %xmm3
1669 ; SSE41-NEXT: por %xmm4, %xmm3
1670 ; SSE41-NEXT: pxor %xmm0, %xmm0
1671 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
1672 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
1673 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1676 ; AVX1-LABEL: splatvar_funnnel_v8i16:
1678 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1679 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1680 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1681 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1682 ; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1683 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1684 ; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1685 ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1686 ; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1687 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1688 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1689 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1690 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1693 ; AVX2-LABEL: splatvar_funnnel_v8i16:
1695 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1696 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1697 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1698 ; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1699 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1700 ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1701 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1702 ; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1703 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1704 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1705 ; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1706 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1709 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
1711 ; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
1712 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1713 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1714 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1715 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1716 ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1717 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1718 ; AVX512F-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1719 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
1720 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1721 ; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1722 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1723 ; AVX512F-NEXT: retq
1725 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1726 ; AVX512VL: # %bb.0:
1727 ; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
1728 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1729 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1730 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1731 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1732 ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1733 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1734 ; AVX512VL-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1735 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
1736 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1737 ; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1738 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1739 ; AVX512VL-NEXT: retq
1741 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1742 ; AVX512BW: # %bb.0:
1743 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1744 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
1745 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1746 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1747 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1748 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1749 ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1750 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1751 ; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1752 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
1753 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
1754 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1755 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1756 ; AVX512BW-NEXT: vzeroupper
1757 ; AVX512BW-NEXT: retq
1759 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1760 ; AVX512VBMI2: # %bb.0:
1761 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1762 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1763 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1764 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1765 ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1766 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1767 ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1768 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1769 ; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1770 ; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
1771 ; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
1772 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1773 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1774 ; AVX512VBMI2-NEXT: vzeroupper
1775 ; AVX512VBMI2-NEXT: retq
1777 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1778 ; AVX512VLBW: # %bb.0:
1779 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
1780 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1781 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1782 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1783 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1784 ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1785 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1786 ; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1787 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
1788 ; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
1789 ; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
1790 ; AVX512VLBW-NEXT: retq
1792 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1793 ; AVX512VLVBMI2: # %bb.0:
1794 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1795 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
1796 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1797 ; AVX512VLVBMI2-NEXT: retq
1799 ; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
1801 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1802 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1803 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1804 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1805 ; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1806 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1807 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1808 ; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1809 ; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1810 ; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1811 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1812 ; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1813 ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1814 ; XOPAVX1-NEXT: retq
1816 ; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
1818 ; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1819 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1820 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1821 ; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1822 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1823 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1824 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1825 ; XOPAVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1826 ; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1827 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1828 ; XOPAVX2-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1829 ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1830 ; XOPAVX2-NEXT: retq
1832 ; X32-SSE-LABEL: splatvar_funnnel_v8i16:
1834 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1835 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1836 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
1837 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1838 ; X32-SSE-NEXT: psubw %xmm3, %xmm4
1839 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
1840 ; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2
1841 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1842 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1843 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1844 ; X32-SSE-NEXT: psrlw %xmm3, %xmm5
1845 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1846 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1847 ; X32-SSE-NEXT: psllw %xmm4, %xmm0
1848 ; X32-SSE-NEXT: por %xmm5, %xmm0
1849 ; X32-SSE-NEXT: pand %xmm2, %xmm1
1850 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
1851 ; X32-SSE-NEXT: por %xmm1, %xmm2
1852 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1853 ; X32-SSE-NEXT: retl
; Broadcast element 0 of %amt into all eight lanes (all-zero shuffle mask).
1854 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
; Per-lane funnel shift right: %x supplies the high half and %y the low half
; of each concatenated value, shifted by the splatted amount.
1855 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
1859 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1860 ; SSE2-LABEL: splatvar_funnnel_v16i8:
1862 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1863 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1864 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1865 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1866 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1867 ; SSE2-NEXT: psubb %xmm3, %xmm4
1868 ; SSE2-NEXT: pxor %xmm2, %xmm2
1869 ; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
1870 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
1871 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1872 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1873 ; SSE2-NEXT: psrlw %xmm3, %xmm5
1874 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
1875 ; SSE2-NEXT: psrlw %xmm3, %xmm6
1876 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
1877 ; SSE2-NEXT: psrlw $8, %xmm6
1878 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1879 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
1880 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
1881 ; SSE2-NEXT: pand %xmm5, %xmm6
1882 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
1883 ; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1884 ; SSE2-NEXT: psllw %xmm4, %xmm0
1885 ; SSE2-NEXT: psllw %xmm4, %xmm3
1886 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1887 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
1888 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1889 ; SSE2-NEXT: pand %xmm0, %xmm3
1890 ; SSE2-NEXT: por %xmm6, %xmm3
1891 ; SSE2-NEXT: pand %xmm2, %xmm1
1892 ; SSE2-NEXT: pandn %xmm3, %xmm2
1893 ; SSE2-NEXT: por %xmm1, %xmm2
1894 ; SSE2-NEXT: movdqa %xmm2, %xmm0
1897 ; SSE41-LABEL: splatvar_funnnel_v16i8:
1899 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1900 ; SSE41-NEXT: pxor %xmm0, %xmm0
1901 ; SSE41-NEXT: pshufb %xmm0, %xmm2
1902 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1903 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1904 ; SSE41-NEXT: movdqa %xmm1, %xmm5
1905 ; SSE41-NEXT: psrlw %xmm4, %xmm5
1906 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
1907 ; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
1908 ; SSE41-NEXT: psrlw %xmm4, %xmm7
1909 ; SSE41-NEXT: pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1910 ; SSE41-NEXT: pand %xmm5, %xmm7
1911 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1912 ; SSE41-NEXT: psubb %xmm2, %xmm4
1913 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1914 ; SSE41-NEXT: psllw %xmm4, %xmm3
1915 ; SSE41-NEXT: psllw %xmm4, %xmm6
1916 ; SSE41-NEXT: pshufb %xmm0, %xmm6
1917 ; SSE41-NEXT: pand %xmm6, %xmm3
1918 ; SSE41-NEXT: por %xmm7, %xmm3
1919 ; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
1920 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
1921 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1924 ; AVX1-LABEL: splatvar_funnnel_v16i8:
1926 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1927 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1928 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1929 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1930 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm5
1931 ; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
1932 ; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm4
1933 ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1934 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
1935 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1936 ; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1937 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
1938 ; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0
1939 ; AVX1-NEXT: vpsllw %xmm5, %xmm6, %xmm5
1940 ; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5
1941 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
1942 ; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
1943 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1944 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1947 ; AVX2-LABEL: splatvar_funnnel_v16i8:
1949 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1950 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1951 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1952 ; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4
1953 ; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1954 ; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
1955 ; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
1956 ; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3
1957 ; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3
1958 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1959 ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1960 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1961 ; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1962 ; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
1963 ; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
1964 ; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
1965 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1966 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1967 ; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1968 ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1971 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
1973 ; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
1974 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1975 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1976 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1977 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1978 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1979 ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1980 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
1981 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1982 ; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
1983 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
1984 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1985 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1986 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1987 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1988 ; AVX512F-NEXT: vzeroupper
1989 ; AVX512F-NEXT: retq
1991 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1992 ; AVX512VL: # %bb.0:
1993 ; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
1994 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1995 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1996 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1997 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1998 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1999 ; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2000 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
2001 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2002 ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
2003 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
2004 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2005 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
2006 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
2007 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2008 ; AVX512VL-NEXT: vzeroupper
2009 ; AVX512VL-NEXT: retq
2011 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
2012 ; AVX512BW: # %bb.0:
2013 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2014 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
2015 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2016 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2017 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2018 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
2019 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2020 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2021 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2022 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2023 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
2024 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
2025 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2026 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
2027 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
2028 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2029 ; AVX512BW-NEXT: vzeroupper
2030 ; AVX512BW-NEXT: retq
2032 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
2033 ; AVX512VBMI2: # %bb.0:
2034 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2035 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
2036 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2037 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2038 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2039 ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
2040 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2041 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2042 ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2043 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2044 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
2045 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
2046 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2047 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
2048 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
2049 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2050 ; AVX512VBMI2-NEXT: vzeroupper
2051 ; AVX512VBMI2-NEXT: retq
2053 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
2054 ; AVX512VLBW: # %bb.0:
2055 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
2056 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2057 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2058 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2059 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
2060 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2061 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2062 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2063 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2064 ; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
2065 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
2066 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2067 ; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
2068 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2069 ; AVX512VLBW-NEXT: vzeroupper
2070 ; AVX512VLBW-NEXT: retq
2072 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
2073 ; AVX512VLVBMI2: # %bb.0:
2074 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
2075 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2076 ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2077 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2078 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
2079 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2080 ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2081 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2082 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2083 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
2084 ; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
2085 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2086 ; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
2087 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2088 ; AVX512VLVBMI2-NEXT: vzeroupper
2089 ; AVX512VLVBMI2-NEXT: retq
2091 ; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
2093 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2094 ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2095 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2096 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
2097 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4
2098 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2099 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
2100 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
2101 ; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
2102 ; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
2103 ; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2104 ; XOPAVX1-NEXT: retq
2106 ; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
2108 ; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
2109 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2110 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
2111 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4
2112 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4
2113 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2114 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm5
2115 ; XOPAVX2-NEXT: vpshlb %xmm5, %xmm0, %xmm0
2116 ; XOPAVX2-NEXT: vpor %xmm4, %xmm0, %xmm0
2117 ; XOPAVX2-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
2118 ; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2119 ; XOPAVX2-NEXT: retq
2121 ; X32-SSE-LABEL: splatvar_funnnel_v16i8:
2123 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2124 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
2125 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
2126 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
2127 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2128 ; X32-SSE-NEXT: psubb %xmm3, %xmm4
2129 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
2130 ; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
2131 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
2132 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2133 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
2134 ; X32-SSE-NEXT: psrlw %xmm3, %xmm5
2135 ; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6
2136 ; X32-SSE-NEXT: psrlw %xmm3, %xmm6
2137 ; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
2138 ; X32-SSE-NEXT: psrlw $8, %xmm6
2139 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2140 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
2141 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
2142 ; X32-SSE-NEXT: pand %xmm5, %xmm6
2143 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
2144 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2145 ; X32-SSE-NEXT: psllw %xmm4, %xmm0
2146 ; X32-SSE-NEXT: psllw %xmm4, %xmm3
2147 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2148 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
2149 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
2150 ; X32-SSE-NEXT: pand %xmm0, %xmm3
2151 ; X32-SSE-NEXT: por %xmm6, %xmm3
2152 ; X32-SSE-NEXT: pand %xmm2, %xmm1
2153 ; X32-SSE-NEXT: pandn %xmm3, %xmm2
2154 ; X32-SSE-NEXT: por %xmm1, %xmm2
2155 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
2156 ; X32-SSE-NEXT: retl
2157 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
2158 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
; Funnel-shift-right test: calls @llvm.fshr.v2i64 on %x and %y with the
; constant per-lane shift amounts <4, 14>.
;
; The CHECK lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expectations show: the SSE2/SSE4.1/AVX1/X32-SSE sequences shift xmm1
; right by 4 and 14, shift xmm0 left by 60 and 50 (64 minus each amount), merge
; the per-lane results and OR the two halves together. AVX2, the AVX512
; variants without VL+VBMI2, and XOPAVX2 use variable shifts with constant-pool
; operands; XOPAVX1 uses two vpshlq; AVX512VLVBMI2 uses a single vpshrdvq.
2166 define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2167 ; SSE2-LABEL: constant_funnnel_v2i64:
2169 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2170 ; SSE2-NEXT: psrlq $4, %xmm2
2171 ; SSE2-NEXT: psrlq $14, %xmm1
2172 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2173 ; SSE2-NEXT: movdqa %xmm0, %xmm2
2174 ; SSE2-NEXT: psllq $60, %xmm2
2175 ; SSE2-NEXT: psllq $50, %xmm0
2176 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2177 ; SSE2-NEXT: orpd %xmm1, %xmm0
2180 ; SSE41-LABEL: constant_funnnel_v2i64:
2182 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2183 ; SSE41-NEXT: psrlq $14, %xmm2
2184 ; SSE41-NEXT: psrlq $4, %xmm1
2185 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2186 ; SSE41-NEXT: movdqa %xmm0, %xmm2
2187 ; SSE41-NEXT: psllq $50, %xmm2
2188 ; SSE41-NEXT: psllq $60, %xmm0
2189 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2190 ; SSE41-NEXT: por %xmm1, %xmm0
2193 ; AVX1-LABEL: constant_funnnel_v2i64:
2195 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
2196 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
2197 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2198 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
2199 ; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
2200 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2201 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2204 ; AVX2-LABEL: constant_funnnel_v2i64:
2206 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2207 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2208 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2211 ; AVX512F-LABEL: constant_funnnel_v2i64:
2213 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2214 ; AVX512F-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2215 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2216 ; AVX512F-NEXT: retq
2218 ; AVX512VL-LABEL: constant_funnnel_v2i64:
2219 ; AVX512VL: # %bb.0:
2220 ; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2221 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2222 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2223 ; AVX512VL-NEXT: retq
2225 ; AVX512BW-LABEL: constant_funnnel_v2i64:
2226 ; AVX512BW: # %bb.0:
2227 ; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2228 ; AVX512BW-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2229 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2230 ; AVX512BW-NEXT: retq
2232 ; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
2233 ; AVX512VBMI2: # %bb.0:
2234 ; AVX512VBMI2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2235 ; AVX512VBMI2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2236 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2237 ; AVX512VBMI2-NEXT: retq
2239 ; AVX512VLBW-LABEL: constant_funnnel_v2i64:
2240 ; AVX512VLBW: # %bb.0:
2241 ; AVX512VLBW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2242 ; AVX512VLBW-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2243 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2244 ; AVX512VLBW-NEXT: retq
2246 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
2247 ; AVX512VLVBMI2: # %bb.0:
2248 ; AVX512VLVBMI2-NEXT: vpshrdvq {{.*}}(%rip), %xmm0, %xmm1
2249 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2250 ; AVX512VLVBMI2-NEXT: retq
2252 ; XOPAVX1-LABEL: constant_funnnel_v2i64:
2254 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
2255 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
2256 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2257 ; XOPAVX1-NEXT: retq
2259 ; XOPAVX2-LABEL: constant_funnnel_v2i64:
2261 ; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2262 ; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2263 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2264 ; XOPAVX2-NEXT: retq
2266 ; X32-SSE-LABEL: constant_funnnel_v2i64:
2268 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2269 ; X32-SSE-NEXT: psrlq $4, %xmm2
2270 ; X32-SSE-NEXT: psrlq $14, %xmm1
2271 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2272 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
2273 ; X32-SSE-NEXT: psllq $60, %xmm2
2274 ; X32-SSE-NEXT: psllq $50, %xmm0
2275 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2276 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
2277 ; X32-SSE-NEXT: retl
2278 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
; Funnel-shift-right test: calls @llvm.fshr.v4i32 on %x and %y with the
; constant per-lane shift amounts <4, 5, 6, 7>.
;
; The CHECK lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expectations show: SSE2 and X32-SSE shift xmm1 right by the
; immediates 4..7 and compute the xmm0 half with pmuludq against
; [268435456,134217728,67108864,33554432] (2^28..2^25, i.e. 2^(32 - amount)),
; then OR the halves. SSE4.1/AVX1 do the same with a constant-pool pmulld.
; AVX2, the AVX512 variants without VL+VBMI2, and XOPAVX2 use vpsrlvd/vpsllvd;
; XOPAVX1 uses two vpshld; AVX512VLVBMI2 uses a single vpshrdvd.
2282 define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2283 ; SSE2-LABEL: constant_funnnel_v4i32:
2285 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2286 ; SSE2-NEXT: psrld $7, %xmm2
2287 ; SSE2-NEXT: movdqa %xmm1, %xmm3
2288 ; SSE2-NEXT: psrld $6, %xmm3
2289 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2290 ; SSE2-NEXT: movdqa %xmm1, %xmm2
2291 ; SSE2-NEXT: psrld $5, %xmm2
2292 ; SSE2-NEXT: psrld $4, %xmm1
2293 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2294 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2295 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2296 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2297 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
2298 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2299 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2300 ; SSE2-NEXT: pmuludq %xmm3, %xmm2
2301 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2302 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2303 ; SSE2-NEXT: por %xmm1, %xmm0
2306 ; SSE41-LABEL: constant_funnnel_v4i32:
2308 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2309 ; SSE41-NEXT: psrld $7, %xmm2
2310 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2311 ; SSE41-NEXT: psrld $5, %xmm3
2312 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2313 ; SSE41-NEXT: movdqa %xmm1, %xmm2
2314 ; SSE41-NEXT: psrld $6, %xmm2
2315 ; SSE41-NEXT: psrld $4, %xmm1
2316 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2317 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2318 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
2319 ; SSE41-NEXT: por %xmm1, %xmm0
2322 ; AVX1-LABEL: constant_funnnel_v4i32:
2324 ; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
2325 ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
2326 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2327 ; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
2328 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
2329 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
2330 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2331 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2332 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2335 ; AVX2-LABEL: constant_funnnel_v4i32:
2337 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2338 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2339 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2342 ; AVX512F-LABEL: constant_funnnel_v4i32:
2344 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2345 ; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2346 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2347 ; AVX512F-NEXT: retq
2349 ; AVX512VL-LABEL: constant_funnnel_v4i32:
2350 ; AVX512VL: # %bb.0:
2351 ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2352 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2353 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2354 ; AVX512VL-NEXT: retq
2356 ; AVX512BW-LABEL: constant_funnnel_v4i32:
2357 ; AVX512BW: # %bb.0:
2358 ; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2359 ; AVX512BW-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2360 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2361 ; AVX512BW-NEXT: retq
2363 ; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
2364 ; AVX512VBMI2: # %bb.0:
2365 ; AVX512VBMI2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2366 ; AVX512VBMI2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2367 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2368 ; AVX512VBMI2-NEXT: retq
2370 ; AVX512VLBW-LABEL: constant_funnnel_v4i32:
2371 ; AVX512VLBW: # %bb.0:
2372 ; AVX512VLBW-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2373 ; AVX512VLBW-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2374 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2375 ; AVX512VLBW-NEXT: retq
2377 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
2378 ; AVX512VLVBMI2: # %bb.0:
2379 ; AVX512VLVBMI2-NEXT: vpshrdvd {{.*}}(%rip), %xmm0, %xmm1
2380 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2381 ; AVX512VLVBMI2-NEXT: retq
2383 ; XOPAVX1-LABEL: constant_funnnel_v4i32:
2385 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2386 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
2387 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2388 ; XOPAVX1-NEXT: retq
2390 ; XOPAVX2-LABEL: constant_funnnel_v4i32:
2392 ; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2393 ; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2394 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2395 ; XOPAVX2-NEXT: retq
2397 ; X32-SSE-LABEL: constant_funnnel_v4i32:
2399 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2400 ; X32-SSE-NEXT: psrld $7, %xmm2
2401 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2402 ; X32-SSE-NEXT: psrld $6, %xmm3
2403 ; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2404 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2405 ; X32-SSE-NEXT: psrld $5, %xmm2
2406 ; X32-SSE-NEXT: psrld $4, %xmm1
2407 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2408 ; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2409 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2410 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2411 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm0
2412 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2413 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2414 ; X32-SSE-NEXT: pmuludq %xmm3, %xmm2
2415 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2416 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2417 ; X32-SSE-NEXT: por %xmm1, %xmm0
2418 ; X32-SSE-NEXT: retl
2419 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
; Funnel-shift-right test: calls @llvm.fshr.v8i16 on %x and %y with the
; constant per-lane shift amounts <0, 1, 2, 3, 4, 5, 6, 7>.
;
; The CHECK lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; What the expectations show: lane 0 has a shift amount of 0. Every lowering
; except AVX512VLVBMI2 leaves that lane undefined in its constants (the 'u'
; entries) and finishes by taking element 0 from xmm1 (pblendw, or
; pand/pandn/por on SSE2 and X32-SSE). The SSE/AVX/AVX512F/AVX512VL sequences
; implement the shifts as multiplies by <u,32768,...,512> (2^(16 - amount)):
; pmulhuw on xmm1 and pmullw on xmm0. AVX512BW-class targets use
; vpsrlvw/vpsllvw, XOP uses vpshlw, and AVX512VLVBMI2 uses a single vpshrdvw.
2423 define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2424 ; SSE2-LABEL: constant_funnnel_v8i16:
2426 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2427 ; SSE2-NEXT: movdqa %xmm1, %xmm3
2428 ; SSE2-NEXT: pmulhuw %xmm2, %xmm3
2429 ; SSE2-NEXT: pmullw %xmm2, %xmm0
2430 ; SSE2-NEXT: por %xmm3, %xmm0
2431 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2432 ; SSE2-NEXT: pand %xmm2, %xmm0
2433 ; SSE2-NEXT: pandn %xmm1, %xmm2
2434 ; SSE2-NEXT: por %xmm2, %xmm0
2437 ; SSE41-LABEL: constant_funnnel_v8i16:
2439 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2440 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2441 ; SSE41-NEXT: pmulhuw %xmm2, %xmm3
2442 ; SSE41-NEXT: pmullw %xmm2, %xmm0
2443 ; SSE41-NEXT: por %xmm3, %xmm0
2444 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2447 ; AVX-LABEL: constant_funnnel_v8i16:
2449 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2450 ; AVX-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2451 ; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2452 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
2453 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2456 ; AVX512F-LABEL: constant_funnnel_v8i16:
2458 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2459 ; AVX512F-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2460 ; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2461 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
2462 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2463 ; AVX512F-NEXT: retq
2465 ; AVX512VL-LABEL: constant_funnnel_v8i16:
2466 ; AVX512VL: # %bb.0:
2467 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2468 ; AVX512VL-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2469 ; AVX512VL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2470 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
2471 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2472 ; AVX512VL-NEXT: retq
2474 ; AVX512BW-LABEL: constant_funnnel_v8i16:
2475 ; AVX512BW: # %bb.0:
2476 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2477 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2478 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <u,1,2,3,4,5,6,7>
2479 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
2480 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,15,14,13,12,11,10,9>
2481 ; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2482 ; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
2483 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2484 ; AVX512BW-NEXT: vzeroupper
2485 ; AVX512BW-NEXT: retq
2487 ; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
2488 ; AVX512VBMI2: # %bb.0:
2489 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2490 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2491 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,1,2,3,4,5,6,7>
2492 ; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
2493 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,15,14,13,12,11,10,9>
2494 ; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2495 ; AVX512VBMI2-NEXT: vpor %xmm2, %xmm0, %xmm0
2496 ; AVX512VBMI2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2497 ; AVX512VBMI2-NEXT: vzeroupper
2498 ; AVX512VBMI2-NEXT: retq
2500 ; AVX512VLBW-LABEL: constant_funnnel_v8i16:
2501 ; AVX512VLBW: # %bb.0:
2502 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm2
2503 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
2504 ; AVX512VLBW-NEXT: vpor %xmm2, %xmm0, %xmm0
2505 ; AVX512VLBW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2506 ; AVX512VLBW-NEXT: retq
2508 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
2509 ; AVX512VLVBMI2: # %bb.0:
2510 ; AVX512VLVBMI2-NEXT: vpshrdvw {{.*}}(%rip), %xmm0, %xmm1
2511 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2512 ; AVX512VLVBMI2-NEXT: retq
2514 ; XOP-LABEL: constant_funnnel_v8i16:
2516 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
2517 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
2518 ; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
2519 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2522 ; X32-SSE-LABEL: constant_funnnel_v8i16:
2524 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2525 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2526 ; X32-SSE-NEXT: pmulhuw %xmm2, %xmm3
2527 ; X32-SSE-NEXT: pmullw %xmm2, %xmm0
2528 ; X32-SSE-NEXT: por %xmm3, %xmm0
2529 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2530 ; X32-SSE-NEXT: pand %xmm2, %xmm0
2531 ; X32-SSE-NEXT: pandn %xmm1, %xmm2
2532 ; X32-SSE-NEXT: por %xmm2, %xmm0
2533 ; X32-SSE-NEXT: retl
2534 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2538 define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2539 ; SSE2-LABEL: constant_funnnel_v16i8:
2541 ; SSE2-NEXT: pxor %xmm2, %xmm2
2542 ; SSE2-NEXT: movdqa %xmm1, %xmm3
2543 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2544 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = <u,2,4,8,16,32,64,128>
2545 ; SSE2-NEXT: pmullw %xmm4, %xmm3
2546 ; SSE2-NEXT: psrlw $8, %xmm3
2547 ; SSE2-NEXT: movdqa %xmm1, %xmm5
2548 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
2549 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = <u,128,64,32,16,8,4,2>
2550 ; SSE2-NEXT: pmullw %xmm2, %xmm5
2551 ; SSE2-NEXT: psrlw $8, %xmm5
2552 ; SSE2-NEXT: packuswb %xmm3, %xmm5
2553 ; SSE2-NEXT: movdqa %xmm0, %xmm3
2554 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2555 ; SSE2-NEXT: pmullw %xmm4, %xmm3
2556 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2557 ; SSE2-NEXT: pand %xmm4, %xmm3
2558 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2559 ; SSE2-NEXT: pmullw %xmm2, %xmm0
2560 ; SSE2-NEXT: pand %xmm4, %xmm0
2561 ; SSE2-NEXT: packuswb %xmm3, %xmm0
2562 ; SSE2-NEXT: por %xmm5, %xmm0
2563 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2564 ; SSE2-NEXT: pand %xmm2, %xmm0
2565 ; SSE2-NEXT: pandn %xmm1, %xmm2
2566 ; SSE2-NEXT: por %xmm2, %xmm0
2569 ; SSE41-LABEL: constant_funnnel_v16i8:
2571 ; SSE41-NEXT: pxor %xmm2, %xmm2
2572 ; SSE41-NEXT: movdqa %xmm1, %xmm3
2573 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2574 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,2,4,8,16,32,64,128>
2575 ; SSE41-NEXT: pmullw %xmm2, %xmm3
2576 ; SSE41-NEXT: psrlw $8, %xmm3
2577 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2578 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,128,64,32,16,8,4,2>
2579 ; SSE41-NEXT: pmullw %xmm5, %xmm4
2580 ; SSE41-NEXT: psrlw $8, %xmm4
2581 ; SSE41-NEXT: packuswb %xmm3, %xmm4
2582 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2583 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2584 ; SSE41-NEXT: pmullw %xmm2, %xmm0
2585 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2586 ; SSE41-NEXT: pand %xmm2, %xmm0
2587 ; SSE41-NEXT: pmullw %xmm5, %xmm3
2588 ; SSE41-NEXT: pand %xmm2, %xmm3
2589 ; SSE41-NEXT: packuswb %xmm0, %xmm3
2590 ; SSE41-NEXT: por %xmm4, %xmm3
2591 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2592 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1
2593 ; SSE41-NEXT: movdqa %xmm1, %xmm0
2596 ; AVX1-LABEL: constant_funnnel_v16i8:
2598 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2599 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2600 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,2,4,8,16,32,64,128>
2601 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
2602 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2603 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2604 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,128,64,32,16,8,4,2>
2605 ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
2606 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
2607 ; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
2608 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2609 ; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
2610 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2611 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
2612 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2613 ; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
2614 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2615 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
2616 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
2617 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2618 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2621 ; AVX2-LABEL: constant_funnnel_v16i8:
2623 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2624 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2625 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2626 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2627 ; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2628 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2629 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2630 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
2631 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
2632 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
2633 ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
2634 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2635 ; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2636 ; AVX2-NEXT: vzeroupper
2639 ; AVX512F-LABEL: constant_funnnel_v16i8:
2641 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2642 ; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2643 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2644 ; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
2645 ; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
2646 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2647 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2648 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2649 ; AVX512F-NEXT: vzeroupper
2650 ; AVX512F-NEXT: retq
2652 ; AVX512VL-LABEL: constant_funnnel_v16i8:
2653 ; AVX512VL: # %bb.0:
2654 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2655 ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2656 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2657 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
2658 ; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
2659 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2660 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2661 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2662 ; AVX512VL-NEXT: vzeroupper
2663 ; AVX512VL-NEXT: retq
2665 ; AVX512BW-LABEL: constant_funnnel_v16i8:
2666 ; AVX512BW: # %bb.0:
2667 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
2668 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2669 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
2670 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
2671 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2672 ; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2673 ; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
2674 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2675 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2676 ; AVX512BW-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2677 ; AVX512BW-NEXT: vzeroupper
2678 ; AVX512BW-NEXT: retq
2680 ; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2681 ; AVX512VBMI2: # %bb.0:
2682 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,1,2,3,4,5,6,7,u,7,6,5,4,3,2,1>
2683 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2684 ; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
2685 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,7,6,5,4,3,2,1,u,1,2,3,4,5,6,7>
2686 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2687 ; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2688 ; AVX512VBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
2689 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2690 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2691 ; AVX512VBMI2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2692 ; AVX512VBMI2-NEXT: vzeroupper
2693 ; AVX512VBMI2-NEXT: retq
2695 ; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2696 ; AVX512VLBW: # %bb.0:
2697 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2698 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2699 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2700 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
2701 ; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
2702 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2703 ; AVX512VLBW-NEXT: movw $257, %ax # imm = 0x101
2704 ; AVX512VLBW-NEXT: kmovd %eax, %k1
2705 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2706 ; AVX512VLBW-NEXT: vzeroupper
2707 ; AVX512VLBW-NEXT: retq
2709 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2710 ; AVX512VLVBMI2: # %bb.0:
2711 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2712 ; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2713 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2714 ; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
2715 ; AVX512VLVBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
2716 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2717 ; AVX512VLVBMI2-NEXT: movw $257, %ax # imm = 0x101
2718 ; AVX512VLVBMI2-NEXT: kmovd %eax, %k1
2719 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2720 ; AVX512VLVBMI2-NEXT: vzeroupper
2721 ; AVX512VLVBMI2-NEXT: retq
2723 ; XOP-LABEL: constant_funnnel_v16i8:
2725 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm2
2726 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
2727 ; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
2728 ; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2729 ; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2732 ; X32-SSE-LABEL: constant_funnnel_v16i8:
2734 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
2735 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2736 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2737 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = <u,2,4,8,16,32,64,128>
2738 ; X32-SSE-NEXT: pmullw %xmm4, %xmm3
2739 ; X32-SSE-NEXT: psrlw $8, %xmm3
2740 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
2741 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
2742 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = <u,128,64,32,16,8,4,2>
2743 ; X32-SSE-NEXT: pmullw %xmm2, %xmm5
2744 ; X32-SSE-NEXT: psrlw $8, %xmm5
2745 ; X32-SSE-NEXT: packuswb %xmm3, %xmm5
2746 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
2747 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2748 ; X32-SSE-NEXT: pmullw %xmm4, %xmm3
2749 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2750 ; X32-SSE-NEXT: pand %xmm4, %xmm3
2751 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2752 ; X32-SSE-NEXT: pmullw %xmm2, %xmm0
2753 ; X32-SSE-NEXT: pand %xmm4, %xmm0
2754 ; X32-SSE-NEXT: packuswb %xmm3, %xmm0
2755 ; X32-SSE-NEXT: por %xmm5, %xmm0
2756 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2757 ; X32-SSE-NEXT: pand %xmm2, %xmm0
2758 ; X32-SSE-NEXT: pandn %xmm1, %xmm2
2759 ; X32-SSE-NEXT: por %xmm2, %xmm0
2760 ; X32-SSE-NEXT: retl
2761 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2766 ; Uniform Constant Shifts
; Funnel shift right (llvm.fshr) of <2 x i64> by the splat constant 14.
; All but one of the run lines expect a three-instruction expansion: shift xmm1
; right by 14, shift xmm0 left by 50 (64 - 14), then OR the two results together.
; The AVX512VLVBMI2 run instead expects a single vpshrdq with immediate 14.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2769 define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2770 ; SSE-LABEL: splatconstant_funnnel_v2i64:
2772 ; SSE-NEXT: psrlq $14, %xmm1
2773 ; SSE-NEXT: psllq $50, %xmm0
2774 ; SSE-NEXT: por %xmm1, %xmm0
2777 ; AVX-LABEL: splatconstant_funnnel_v2i64:
2779 ; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
2780 ; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
2781 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2784 ; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2786 ; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1
2787 ; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0
2788 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2789 ; AVX512F-NEXT: retq
2791 ; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2792 ; AVX512VL: # %bb.0:
2793 ; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1
2794 ; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0
2795 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2796 ; AVX512VL-NEXT: retq
2798 ; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2799 ; AVX512BW: # %bb.0:
2800 ; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1
2801 ; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0
2802 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2803 ; AVX512BW-NEXT: retq
2805 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2806 ; AVX512VBMI2: # %bb.0:
2807 ; AVX512VBMI2-NEXT: vpsrlq $14, %xmm1, %xmm1
2808 ; AVX512VBMI2-NEXT: vpsllq $50, %xmm0, %xmm0
2809 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2810 ; AVX512VBMI2-NEXT: retq
2812 ; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2813 ; AVX512VLBW: # %bb.0:
2814 ; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1
2815 ; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0
2816 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2817 ; AVX512VLBW-NEXT: retq
2819 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2820 ; AVX512VLVBMI2: # %bb.0:
2821 ; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0
2822 ; AVX512VLVBMI2-NEXT: retq
2824 ; XOP-LABEL: splatconstant_funnnel_v2i64:
2826 ; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
2827 ; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
2828 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2831 ; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
2833 ; X32-SSE-NEXT: psrlq $14, %xmm1
2834 ; X32-SSE-NEXT: psllq $50, %xmm0
2835 ; X32-SSE-NEXT: por %xmm1, %xmm0
2836 ; X32-SSE-NEXT: retl
2837 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
; Funnel shift right (llvm.fshr) of <4 x i32> by the splat constant 4.
; All but one of the run lines expect a three-instruction expansion: shift xmm1
; right by 4, shift xmm0 left by 28 (32 - 4), then OR the two results together.
; The AVX512VLVBMI2 run instead expects a single vpshrdd with immediate 4.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2841 define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2842 ; SSE-LABEL: splatconstant_funnnel_v4i32:
2844 ; SSE-NEXT: psrld $4, %xmm1
2845 ; SSE-NEXT: pslld $28, %xmm0
2846 ; SSE-NEXT: por %xmm1, %xmm0
2849 ; AVX-LABEL: splatconstant_funnnel_v4i32:
2851 ; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
2852 ; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2853 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2856 ; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2858 ; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1
2859 ; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0
2860 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2861 ; AVX512F-NEXT: retq
2863 ; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2864 ; AVX512VL: # %bb.0:
2865 ; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
2866 ; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0
2867 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2868 ; AVX512VL-NEXT: retq
2870 ; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2871 ; AVX512BW: # %bb.0:
2872 ; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1
2873 ; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0
2874 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2875 ; AVX512BW-NEXT: retq
2877 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2878 ; AVX512VBMI2: # %bb.0:
2879 ; AVX512VBMI2-NEXT: vpsrld $4, %xmm1, %xmm1
2880 ; AVX512VBMI2-NEXT: vpslld $28, %xmm0, %xmm0
2881 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2882 ; AVX512VBMI2-NEXT: retq
2884 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2885 ; AVX512VLBW: # %bb.0:
2886 ; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1
2887 ; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0
2888 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2889 ; AVX512VLBW-NEXT: retq
2891 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2892 ; AVX512VLVBMI2: # %bb.0:
2893 ; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
2894 ; AVX512VLVBMI2-NEXT: retq
2896 ; XOP-LABEL: splatconstant_funnnel_v4i32:
2898 ; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
2899 ; XOP-NEXT: vpslld $28, %xmm0, %xmm0
2900 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2903 ; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
2905 ; X32-SSE-NEXT: psrld $4, %xmm1
2906 ; X32-SSE-NEXT: pslld $28, %xmm0
2907 ; X32-SSE-NEXT: por %xmm1, %xmm0
2908 ; X32-SSE-NEXT: retl
2909 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
; Funnel shift right (llvm.fshr) of <8 x i16> by the splat constant 7.
; All but one of the run lines expect a three-instruction expansion: shift xmm1
; right by 7, shift xmm0 left by 9 (16 - 7), then OR the two results together.
; The AVX512VLVBMI2 run instead expects a single vpshrdw with immediate 7.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2913 define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2914 ; SSE-LABEL: splatconstant_funnnel_v8i16:
2916 ; SSE-NEXT: psrlw $7, %xmm1
2917 ; SSE-NEXT: psllw $9, %xmm0
2918 ; SSE-NEXT: por %xmm1, %xmm0
2921 ; AVX-LABEL: splatconstant_funnnel_v8i16:
2923 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
2924 ; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2925 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2928 ; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2930 ; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1
2931 ; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
2932 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2933 ; AVX512F-NEXT: retq
2935 ; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2936 ; AVX512VL: # %bb.0:
2937 ; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1
2938 ; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
2939 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2940 ; AVX512VL-NEXT: retq
2942 ; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2943 ; AVX512BW: # %bb.0:
2944 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
2945 ; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
2946 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2947 ; AVX512BW-NEXT: retq
2949 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2950 ; AVX512VBMI2: # %bb.0:
2951 ; AVX512VBMI2-NEXT: vpsrlw $7, %xmm1, %xmm1
2952 ; AVX512VBMI2-NEXT: vpsllw $9, %xmm0, %xmm0
2953 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2954 ; AVX512VBMI2-NEXT: retq
2956 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2957 ; AVX512VLBW: # %bb.0:
2958 ; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1
2959 ; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
2960 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2961 ; AVX512VLBW-NEXT: retq
2963 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2964 ; AVX512VLVBMI2: # %bb.0:
2965 ; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0
2966 ; AVX512VLVBMI2-NEXT: retq
2968 ; XOP-LABEL: splatconstant_funnnel_v8i16:
2970 ; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
2971 ; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
2972 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2975 ; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
2977 ; X32-SSE-NEXT: psrlw $7, %xmm1
2978 ; X32-SSE-NEXT: psllw $9, %xmm0
2979 ; X32-SSE-NEXT: por %xmm1, %xmm0
2980 ; X32-SSE-NEXT: retl
2981 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
; Funnel shift right (llvm.fshr) of <16 x i8> by the splat constant 4.
; The SSE, AVX, AVX512F, AVX512BW, AVX512VBMI2 and X32-SSE expectations use 16-bit
; shifts (psrlw/psllw by 4) on xmm1 and xmm0, each followed by a pand with a
; constant-pool operand, and a final por.
; NOTE(review): the pand masks presumably clear bits that the word-sized shifts
; moved across byte boundaries; the mask values are not visible in these checks.
; The AVX512VL, AVX512VLBW and AVX512VLVBMI2 runs expect one vpternlogq $216 with
; a constant-pool operand in place of the two vpand and the vpor.
; The XOP runs expect two vpshlb with constant-pool shift amounts, then a vpor.
; On the 32-bit run the constant-pool operand is matched as a .LCPI label
; instead of a %rip-relative address.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
2985 define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2986 ; SSE-LABEL: splatconstant_funnnel_v16i8:
2988 ; SSE-NEXT: psrlw $4, %xmm1
2989 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
2990 ; SSE-NEXT: psllw $4, %xmm0
2991 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
2992 ; SSE-NEXT: por %xmm1, %xmm0
2995 ; AVX-LABEL: splatconstant_funnnel_v16i8:
2997 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
2998 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2999 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
3000 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3001 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
3004 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
3006 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm1
3007 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
3008 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
3009 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3010 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
3011 ; AVX512F-NEXT: retq
3013 ; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
3014 ; AVX512VL: # %bb.0:
3015 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
3016 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
3017 ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm2, %xmm0
3018 ; AVX512VL-NEXT: retq
3020 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
3021 ; AVX512BW: # %bb.0:
3022 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm1
3023 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
3024 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
3025 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3026 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
3027 ; AVX512BW-NEXT: retq
3029 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
3030 ; AVX512VBMI2: # %bb.0:
3031 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm1
3032 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
3033 ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm0
3034 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3035 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
3036 ; AVX512VBMI2-NEXT: retq
3038 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
3039 ; AVX512VLBW: # %bb.0:
3040 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
3041 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
3042 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm2, %xmm0
3043 ; AVX512VLBW-NEXT: retq
3045 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
3046 ; AVX512VLVBMI2: # %bb.0:
3047 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
3048 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
3049 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm2, %xmm0
3050 ; AVX512VLVBMI2-NEXT: retq
3052 ; XOP-LABEL: splatconstant_funnnel_v16i8:
3054 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
3055 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
3056 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
3059 ; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
3061 ; X32-SSE-NEXT: psrlw $4, %xmm1
3062 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
3063 ; X32-SSE-NEXT: psllw $4, %xmm0
3064 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
3065 ; X32-SSE-NEXT: por %xmm1, %xmm0
3066 ; X32-SSE-NEXT: retl
3067 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)