; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 cases.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2

declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

;
; Variable Shifts
;

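; The fshr intrinsic returns the low half of the double-width value (x:y)
; shifted right by amt, modulo the element width. Only AVX512VBMI2 has a
; native funnel shift (vpshrdvq and friends); everywhere else the expansion
; checked below is roughly
;   (y lshr (amt & (bw-1))) | (x shl (bw - (amt & (bw-1))))
; followed by a compare/blend that keeps y unchanged when the masked amount
; is zero, since the left shift by a full element width would be undefined.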
define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
; SSE2-NEXT: psubq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: psllq %xmm3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
; SSE41-NEXT: psubq %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: psllq %xmm0, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64]
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshlq %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlq %xmm4, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm3, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
  ret <2 x i64> %res
}

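; v4i32: without AVX2 there is no per-element 32-bit variable shift, so the
; four right shifts run one lane at a time through psrld (which takes a
; single count) and are stitched back together; the left shift is done by
; building 2^(32-amt) per lane with the pslld $23 + float-bias + cvttps2dq
; exponent trick and multiplying with pmuludq/pmulld.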
define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld %xmm5, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrld %xmm5, %xmm6
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrld %xmm4, %xmm5
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
; SSE2-NEXT: psubd %xmm2, %xmm4
; SSE2-NEXT: pslld $23, %xmm4
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm4
; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE2-NEXT: por %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm6, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: pslld $23, %xmm0
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: pmulld %xmm0, %xmm3
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm5, %xmm5
; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; AVX1-NEXT: vpmulld %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshld %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrld %xmm5, %xmm3
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm6
; X32-SSE-NEXT: psrld %xmm5, %xmm6
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrld %xmm4, %xmm5
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
; X32-SSE-NEXT: psubd %xmm2, %xmm4
; X32-SSE-NEXT: pslld $23, %xmm4
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm4, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm5, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; X32-SSE-NEXT: por %xmm3, %xmm6
; X32-SSE-NEXT: pxor %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pandn %xmm6, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
  ret <4 x i32> %res
}

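; v8i16: variable 16-bit shifts only exist with AVX512BW, so the right
; shift is applied bit-serially: psllw $12 moves each amount's bits into
; the sign-bit position and four rounds of sign-bit masks (psraw $15 plus
; pand/pandn, or pblendvb) conditionally shift by 8, 4, 2 and 1. The left
; shift reuses the pslld $23 exponent trick on two widened halves, then
; packs and multiplies with pmullw.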
define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $12, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: psraw $15, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: psrlw $4, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; SSE2-NEXT: psubw %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: pslld $23, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm6, %xmm7
; SSE2-NEXT: cvttps2dq %xmm7, %xmm7
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: pslld $23, %xmm4
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE2-NEXT: pmullw %xmm0, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pcmpeqw %xmm8, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; SSE41-NEXT: psubw %xmm2, %xmm5
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; SSE41-NEXT: pcmpeqw %xmm2, %xmm4
; SSE41-NEXT: psllw $12, %xmm2
; SSE41-NEXT: psllw $4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: paddw %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm7
; SSE41-NEXT: psrlw $8, %xmm7
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $4, %xmm7
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $2, %xmm7
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $1, %xmm7
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: pslld $23, %xmm5
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT: paddd %xmm0, %xmm5
; SSE41-NEXT: cvttps2dq %xmm5, %xmm2
; SSE41-NEXT: pslld $23, %xmm6
; SSE41-NEXT: paddd %xmm0, %xmm6
; SSE41-NEXT: cvttps2dq %xmm6, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: pmullw %xmm0, %xmm8
; SSE41-NEXT: por %xmm3, %xmm8
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpackusdw %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm4
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; XOP-NEXT: vpsubw %xmm2, %xmm5, %xmm5
; XOP-NEXT: vpshlw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOP-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: subl $28, %esp
; X32-SSE-NEXT: movups %xmm0, (%esp) # 16-byte Spill
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: psllw $12, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm3
; X32-SSE-NEXT: psraw $15, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlw $8, %xmm5
; X32-SSE-NEXT: pand %xmm3, %xmm5
; X32-SSE-NEXT: pandn %xmm1, %xmm3
; X32-SSE-NEXT: por %xmm5, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: psraw $15, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm3, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm3
; X32-SSE-NEXT: pand %xmm5, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: psraw $15, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm3, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm3
; X32-SSE-NEXT: pand %xmm5, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: psraw $15, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm3, %xmm5
; X32-SSE-NEXT: psrlw $1, %xmm3
; X32-SSE-NEXT: pand %xmm4, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: psubw %xmm2, %xmm4
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: movdqa %xmm4, %xmm7
; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; X32-SSE-NEXT: pslld $23, %xmm7
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
; X32-SSE-NEXT: paddd %xmm0, %xmm7
; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; X32-SSE-NEXT: pslld $23, %xmm4
; X32-SSE-NEXT: paddd %xmm0, %xmm4
; X32-SSE-NEXT: cvttps2dq %xmm7, %xmm0
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; X32-SSE-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload
; X32-SSE-NEXT: pmullw %xmm0, %xmm4
; X32-SSE-NEXT: por %xmm5, %xmm4
; X32-SSE-NEXT: por %xmm3, %xmm4
; X32-SSE-NEXT: pcmpeqw %xmm6, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm4, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: addl $28, %esp
; X32-SSE-NEXT: retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
  ret <8 x i16> %res
}

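; v16i8: x86 has no byte-sized shifts at all, so SSE/AVX shift whole words
; and mask off the bits that crossed byte boundaries, selecting per byte
; with psllw $5-staged amounts and pcmpgtb/pblendvb; the AVX512 variants
; instead zero-extend the bytes to words or dwords, use the wide variable
; shifts, and truncate back down.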
define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psllw $5, %xmm5
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pandn %xmm1, %xmm6
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm6, %xmm4
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm4, %xmm7
; SSE2-NEXT: psrlw $2, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm2, %xmm5
; SSE2-NEXT: psllw $5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm0, %xmm7
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm0, %xmm7
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
; SSE2-NEXT: pcmpgtb %xmm5, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE41-NEXT: psubb %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
; SSE41-NEXT: psllw $5, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: paddb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psllw $4, %xmm7
; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllw $2, %xmm4
; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: paddb %xmm3, %xmm4
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT: por %xmm6, %xmm3
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_funnnel_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpsllw $5, %xmm2, %xmm3
; AVX-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm4
; AVX-NEXT: vpsrlw $2, %xmm4, %xmm5
; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpsrlw $1, %xmm4, %xmm5
; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
; AVX-NEXT: vpsllw $4, %xmm0, %xmm6
; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm0, %xmm0
; AVX-NEXT: vpsllw $2, %xmm0, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: var_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm4
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; XOP-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; XOP-NEXT: vpshlb %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOP-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm5
; X32-SSE-NEXT: psllw $5, %xmm5
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm4
; X32-SSE-NEXT: pand %xmm6, %xmm4
; X32-SSE-NEXT: pandn %xmm1, %xmm6
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm6, %xmm4
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm4, %xmm7
; X32-SSE-NEXT: psrlw $2, %xmm4
; X32-SSE-NEXT: pand %xmm6, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm7, %xmm4
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm5
; X32-SSE-NEXT: pandn %xmm4, %xmm5
; X32-SSE-NEXT: psrlw $1, %xmm4
; X32-SSE-NEXT: pand %xmm6, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm5, %xmm4
; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X32-SSE-NEXT: psubb %xmm2, %xmm5
; X32-SSE-NEXT: psllw $5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm0, %xmm7
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand %xmm6, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm7, %xmm0
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm0, %xmm7
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand %xmm6, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm7, %xmm0
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm5
; X32-SSE-NEXT: por %xmm4, %xmm5
; X32-SSE-NEXT: paddb %xmm0, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
  ret <16 x i8> %res
}

;
; Uniform Variable Shifts
;
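
; In the splatvar tests the shift amount is a single element splatted to all
; lanes, so targets can lower the funnel shift with one scalar shift count
; (e.g. psrlq/psllq with an xmm count) rather than a per-element shift. The
; two halves combine as
;   fshr(x, y, amt) = (y lshr amt) | (x shl (BITS - amt))   for amt != 0
; e.g. for i8: fshr(0x12, 0x34, 4) = low 8 bits of (0x1234 >> 4) = 0x23.
; Because a shift by the full bit width is not defined, amt == 0 must select
; y directly, which is the compare-with-zero + blend that precedes each ret
; below; on SSE2, which has no pcmpeqq, the 64-bit equality mask is built
; from pcmpeqd plus a pshufd/pand of the swapped dword halves.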
define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlq %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
; SSE2-NEXT: psubq %xmm2, %xmm4
; SSE2-NEXT: psllq %xmm4, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
; SSE41-NEXT: psubq %xmm2, %xmm4
; SSE41-NEXT: psllq %xmm4, %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlq %xmm4, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm3, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
  ret <2 x i64> %res
}

define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: xorps %xmm4, %xmm4
; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrld %xmm4, %xmm5
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: movl $32, %ecx
; SSE2-NEXT: subl %eax, %ecx
; SSE2-NEXT: movd %ecx, %xmm4
; SSE2-NEXT: pslld %xmm4, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pslld %xmm0, %xmm3
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; XOPAVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: xorps %xmm4, %xmm4
; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrld %xmm4, %xmm5
; X32-SSE-NEXT: movd %xmm2, %eax
; X32-SSE-NEXT: movl $32, %ecx
; X32-SSE-NEXT: subl %eax, %ecx
; X32-SSE-NEXT: movd %ecx, %xmm4
; X32-SSE-NEXT: pslld %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
  ret <4 x i32> %res
}

define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; SSE2-NEXT: psubw %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw %xmm3, %xmm5
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: psllw %xmm4, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlw %xmm0, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
; SSE41-NEXT: psubw %xmm2, %xmm0
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: psllw %xmm0, %xmm3
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; XOPAVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: psubw %xmm3, %xmm4
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2
; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlw %xmm3, %xmm5
; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: psllw %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
  ret <8 x i16> %res
}

define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw %xmm3, %xmm5
; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
; SSE2-NEXT: psrlw %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: psrlw $8, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: psllw %xmm4, %xmm0
; SSE2-NEXT: psllw %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlw %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
; SSE41-NEXT: psrlw %xmm4, %xmm7
; SSE41-NEXT: pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: pand %xmm5, %xmm7
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE41-NEXT: psubb %xmm2, %xmm4
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: psllw %xmm4, %xmm3
; SSE41-NEXT: psllw %xmm4, %xmm6
; SSE41-NEXT: pshufb %xmm0, %xmm6
; SSE41-NEXT: pand %xmm6, %xmm3
; SSE41-NEXT: por %xmm7, %xmm3
; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsllw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4
; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3
; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; XOPAVX2-NEXT: vpshlb %xmm5, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X32-SSE-NEXT: psubb %xmm3, %xmm4
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlw %xmm3, %xmm5
; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6
; X32-SSE-NEXT: psrlw %xmm3, %xmm6
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm6
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
; X32-SSE-NEXT: pand %xmm5, %xmm6
; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: psllw %xmm4, %xmm0
; X32-SSE-NEXT: psllw %xmm4, %xmm3
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; X32-SSE-NEXT: pand %xmm0, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm3, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
  ret <16 x i8> %res
}

;
; Constant Shifts
;
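
; With constant shift amounts the backend can use immediate-count shifts
; (psrlq $4, psrld $7, ...) instead of variable shifts. The left half of the
; funnel, x << (BITS - amt), is a multiply by 1 << (BITS - amt), so the v4i32
; versions below fold it into pmuludq/pmulld: the SSE2 constant
; [268435456,134217728,67108864,33554432] is 1 << 28, 1 << 27, 1 << 26,
; 1 << 25 for the amounts 4, 5, 6, 7. No zero-amount blend is emitted here
; because none of the constant amounts is zero.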
define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlq $4, %xmm2
; SSE2-NEXT: psrlq $14, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllq $60, %xmm2
; SSE2-NEXT: psllq $50, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: orpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlq $14, %xmm2
; SSE41-NEXT: psrlq $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllq $50, %xmm2
; SSE41-NEXT: psllq $60, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
2290 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
2291 ; AVX512VLVBMI2: # %bb.0:
2292 ; AVX512VLVBMI2-NEXT: vpshrdvq {{.*}}(%rip), %xmm0, %xmm1
2293 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2294 ; AVX512VLVBMI2-NEXT: retq
2296 ; XOPAVX1-LABEL: constant_funnnel_v2i64:
2298 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
2299 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
2300 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2301 ; XOPAVX1-NEXT: retq
2303 ; XOPAVX2-LABEL: constant_funnnel_v2i64:
2305 ; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2306 ; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2307 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2308 ; XOPAVX2-NEXT: retq
2310 ; X32-SSE-LABEL: constant_funnnel_v2i64:
2312 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2313 ; X32-SSE-NEXT: psrlq $4, %xmm2
2314 ; X32-SSE-NEXT: psrlq $14, %xmm1
2315 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2316 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
2317 ; X32-SSE-NEXT: psllq $60, %xmm2
2318 ; X32-SSE-NEXT: psllq $50, %xmm0
2319 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2320 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
2321 ; X32-SSE-NEXT: retl
2322 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
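; In the v4i32 constant case below, the left-shift half of the expansion is a
; multiply: the per-lane factors 268435456, 134217728, 67108864 and 33554432
; are 2^28, 2^27, 2^26 and 2^25, i.e. left shifts by 32-4 .. 32-7. SSE2 has no
; 32-bit vector multiply, so pmuludq is applied to the even and odd lanes
; separately; SSE4.1 and later can use a single pmulld/vpmulld.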
define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrld $7, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld $6, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrld $5, %xmm2
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrld $7, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psrld $5, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrld $6, %xmm2
; SSE41-NEXT: psrld $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvd {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: constant_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_funnnel_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psrld $7, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrld $6, %xmm3
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psrld $5, %xmm2
; X32-SSE-NEXT: psrld $4, %xmm1
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm2, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm3, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
  ret <4 x i32> %res
}

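; For v8i16 the same multiply trick uses one constant for both halves:
; pmulhuw by 2^(16-n) yields the logical right shift by n, while pmullw by
; the same factor yields the left shift by 16-n. Lane 0 has amount 0, so its
; factor is undef (<u,...>) and that lane is instead taken from %y, either by
; the mask-and-or dance (SSE2) or by the final pblendw.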
define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
; SSE2-NEXT: pmulhuw %xmm4, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pmullw %xmm4, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pmulhuw %xmm2, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_funnnel_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
; AVX-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
; AVX512F-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
; AVX512VL-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,15,14,13,12,11,10,9]
; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,15,14,13,12,11,10,9]
; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm2
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvw {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: constant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: pandn %xmm1, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
; X32-SSE-NEXT: pmulhuw %xmm4, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pmullw %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
; X32-SSE-NEXT: retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
  ret <8 x i16> %res
}

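; x86 has no byte-granularity vector shifts, so the v16i8 constant case is
; widened: bytes are unpacked or zero-extended to words, shifted via pmullw
; (with psrlw $8 recovering the right-shifted half), and repacked with
; packuswb. Lanes 0 and 8 have amount 0 and are re-selected from %y with the
; [0,255,...] mask.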
define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: packuswb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm4
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: por %xmm2, %xmm4
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512VBMI2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: movw $257, %ax # imm = 0x101
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT: movw $257, %ax # imm = 0x101
; AVX512VLVBMI2-NEXT: kmovd %eax, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: constant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: psrlw $8, %xmm4
; X32-SSE-NEXT: packuswb %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pandn %xmm1, %xmm2
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <16 x i8> %res
}

;
; Uniform Constant Shifts
;

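; With a splat constant amount no per-lane work is needed; each test below
; reduces to one shift pair plus an OR. For example, fshr by 14 on <2 x i64>
; is equivalent to the following IR (a sketch mirroring the checks below,
; not itself part of the test):
;   %hi = shl <2 x i64> %x, <i64 50, i64 50>    ; 64 - 14
;   %lo = lshr <2 x i64> %y, <i64 14, i64 14>
;   %r  = or <2 x i64> %hi, %lo
; AVX512VBMI2+VL targets instead match the whole pattern to a single
; vpshrd[qdw] instruction.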
define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: psrlq $14, %xmm1
; SSE-NEXT: psllq $50, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlq $14, %xmm1
; X32-SSE-NEXT: psllq $50, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
  ret <2 x i64> %res
}

define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: psrld $4, %xmm1
; SSE-NEXT: pslld $28, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX-NEXT: vpslld $28, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpslld $28, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
; XOP-NEXT: vpslld $28, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrld $4, %xmm1
; X32-SSE-NEXT: pslld $28, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
  ret <4 x i32> %res
}

define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $7, %xmm1
; SSE-NEXT: psllw $9, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: splatconstant_funnnel_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512VBMI2-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $7, %xmm1
; X32-SSE-NEXT: psllw $9, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <8 x i16> %res
}

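; Even the splat byte case must borrow the 16-bit shifters: psrlw/psllw shift
; across byte boundaries, so each half is masked with pand to clear the bits
; dragged in from the neighbouring byte. XOP can use vpshlb directly.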
define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $4, %xmm1
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_funnnel_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <16 x i8> %res
}