1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512VLBW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefix=AVX512VBMI2
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefix=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
15 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE2
; Funnel-shift-right intrinsics for the four 128-bit integer vector types used
; by the tests in this file. Each test passes the same value for both data
; operands, which makes the funnel shift a rotate right by the amount operand.
18 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
20 declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
21 declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
; Rotate right of each i64 element of %x by the matching element of %amt,
; written as @llvm.fshr.v2i64 with %x supplied for both data operands.
; In the expectations below, every AVX512 configuration selects a single
; vprorvq (on zmm registers when avx512vl is not enabled, on xmm otherwise),
; XOP negates the amount and uses vprotq, and the remaining configurations
; combine a logical right shift by (amt & 63) with a left shift by (-amt & 63).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
27 define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
28 ; SSE2-LABEL: var_funnnel_v2i64:
30 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,63]
31 ; SSE2-NEXT: pxor %xmm3, %xmm3
32 ; SSE2-NEXT: psubq %xmm1, %xmm3
33 ; SSE2-NEXT: pand %xmm2, %xmm1
34 ; SSE2-NEXT: movdqa %xmm0, %xmm4
35 ; SSE2-NEXT: psrlq %xmm1, %xmm4
36 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
37 ; SSE2-NEXT: movdqa %xmm0, %xmm5
38 ; SSE2-NEXT: psrlq %xmm1, %xmm5
39 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
40 ; SSE2-NEXT: pand %xmm2, %xmm3
41 ; SSE2-NEXT: movdqa %xmm0, %xmm1
42 ; SSE2-NEXT: psllq %xmm3, %xmm1
43 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
44 ; SSE2-NEXT: psllq %xmm2, %xmm0
45 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
46 ; SSE2-NEXT: orpd %xmm5, %xmm0
49 ; SSE41-LABEL: var_funnnel_v2i64:
51 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [63,63]
52 ; SSE41-NEXT: pxor %xmm3, %xmm3
53 ; SSE41-NEXT: psubq %xmm1, %xmm3
54 ; SSE41-NEXT: pand %xmm2, %xmm1
55 ; SSE41-NEXT: movdqa %xmm0, %xmm4
56 ; SSE41-NEXT: psrlq %xmm1, %xmm4
57 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
58 ; SSE41-NEXT: movdqa %xmm0, %xmm5
59 ; SSE41-NEXT: psrlq %xmm1, %xmm5
60 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5,6,7]
61 ; SSE41-NEXT: pand %xmm2, %xmm3
62 ; SSE41-NEXT: movdqa %xmm0, %xmm1
63 ; SSE41-NEXT: psllq %xmm3, %xmm1
64 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
65 ; SSE41-NEXT: psllq %xmm2, %xmm0
66 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
67 ; SSE41-NEXT: por %xmm5, %xmm0
70 ; AVX1-LABEL: var_funnnel_v2i64:
72 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
73 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
74 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4
75 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
76 ; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
77 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
78 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
79 ; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
80 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
81 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
82 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
83 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
84 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
85 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
88 ; AVX2-LABEL: var_funnnel_v2i64:
90 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
91 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
92 ; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3
93 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
94 ; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1
95 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
96 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
97 ; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
100 ; AVX512F-LABEL: var_funnnel_v2i64:
102 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
103 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
104 ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0
105 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
106 ; AVX512F-NEXT: vzeroupper
109 ; AVX512VL-LABEL: var_funnnel_v2i64:
111 ; AVX512VL-NEXT: vprorvq %xmm1, %xmm0, %xmm0
112 ; AVX512VL-NEXT: retq
114 ; AVX512BW-LABEL: var_funnnel_v2i64:
116 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
117 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
118 ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0
119 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
120 ; AVX512BW-NEXT: vzeroupper
121 ; AVX512BW-NEXT: retq
123 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
124 ; AVX512VLBW: # %bb.0:
125 ; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0
126 ; AVX512VLBW-NEXT: retq
128 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
129 ; AVX512VBMI2: # %bb.0:
130 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
131 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
132 ; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
133 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
134 ; AVX512VBMI2-NEXT: vzeroupper
135 ; AVX512VBMI2-NEXT: retq
137 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
138 ; AVX512VLVBMI2: # %bb.0:
139 ; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0
140 ; AVX512VLVBMI2-NEXT: retq
142 ; XOP-LABEL: var_funnnel_v2i64:
144 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
145 ; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
146 ; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
149 ; X86-SSE2-LABEL: var_funnnel_v2i64:
151 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
152 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
153 ; X86-SSE2-NEXT: psubq %xmm1, %xmm3
154 ; X86-SSE2-NEXT: pand %xmm2, %xmm1
155 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
156 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm4
157 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
158 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
159 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm5
160 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
161 ; X86-SSE2-NEXT: pand %xmm2, %xmm3
162 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
163 ; X86-SSE2-NEXT: psllq %xmm3, %xmm1
164 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
165 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
166 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
167 ; X86-SSE2-NEXT: orpd %xmm5, %xmm0
168 ; X86-SSE2-NEXT: retl
169 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt)
; Rotate right of each i32 element of %x by the matching element of %amt,
; written as @llvm.fshr.v4i32 with %x supplied for both data operands.
; In the expectations below, every AVX512 configuration selects a single
; vprorvd (on zmm registers when avx512vl is not enabled, on xmm otherwise)
; and XOP negates the amount and uses vprotd. AVX2 pairs vpsllvd by
; (-amt & 31) with vpsrlvd by 32 minus that value. The SSE2, SSE4.1 and AVX1
; expansions derive a per-lane multiplier from the negated amount, multiply
; with pmuludq, and OR the low and high halves of the products together.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
173 define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind {
174 ; SSE2-LABEL: var_funnnel_v4i32:
176 ; SSE2-NEXT: pxor %xmm2, %xmm2
177 ; SSE2-NEXT: psubd %xmm1, %xmm2
178 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
179 ; SSE2-NEXT: pslld $23, %xmm2
180 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
181 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
182 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
183 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
184 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
185 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
186 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
187 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
188 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
189 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
190 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
191 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
192 ; SSE2-NEXT: por %xmm3, %xmm0
195 ; SSE41-LABEL: var_funnnel_v4i32:
197 ; SSE41-NEXT: pxor %xmm2, %xmm2
198 ; SSE41-NEXT: psubd %xmm1, %xmm2
199 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
200 ; SSE41-NEXT: pslld $23, %xmm2
201 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
202 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
203 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
204 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
205 ; SSE41-NEXT: pmuludq %xmm2, %xmm3
206 ; SSE41-NEXT: pmuludq %xmm1, %xmm0
207 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
208 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
209 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
210 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
211 ; SSE41-NEXT: por %xmm1, %xmm0
214 ; AVX1-LABEL: var_funnnel_v4i32:
216 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
217 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
218 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
219 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
220 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
221 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
222 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
223 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
224 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
225 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
226 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
227 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
228 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
229 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
230 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
233 ; AVX2-LABEL: var_funnnel_v4i32:
235 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
236 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1
237 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
238 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
239 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2
240 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
241 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
242 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
243 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
246 ; AVX512F-LABEL: var_funnnel_v4i32:
248 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
249 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
250 ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
251 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
252 ; AVX512F-NEXT: vzeroupper
255 ; AVX512VL-LABEL: var_funnnel_v4i32:
257 ; AVX512VL-NEXT: vprorvd %xmm1, %xmm0, %xmm0
258 ; AVX512VL-NEXT: retq
260 ; AVX512BW-LABEL: var_funnnel_v4i32:
262 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
263 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
264 ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
265 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
266 ; AVX512BW-NEXT: vzeroupper
267 ; AVX512BW-NEXT: retq
269 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
270 ; AVX512VLBW: # %bb.0:
271 ; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
272 ; AVX512VLBW-NEXT: retq
274 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
275 ; AVX512VBMI2: # %bb.0:
276 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
277 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
278 ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
279 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
280 ; AVX512VBMI2-NEXT: vzeroupper
281 ; AVX512VBMI2-NEXT: retq
283 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
284 ; AVX512VLVBMI2: # %bb.0:
285 ; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
286 ; AVX512VLVBMI2-NEXT: retq
288 ; XOP-LABEL: var_funnnel_v4i32:
290 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
291 ; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm1
292 ; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
295 ; X86-SSE2-LABEL: var_funnnel_v4i32:
297 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
298 ; X86-SSE2-NEXT: psubd %xmm1, %xmm2
299 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
300 ; X86-SSE2-NEXT: pslld $23, %xmm2
301 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
302 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
303 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
304 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
305 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
306 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
307 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
308 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
309 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
310 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
311 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
312 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
313 ; X86-SSE2-NEXT: por %xmm3, %xmm0
314 ; X86-SSE2-NEXT: retl
315 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %amt)
; Rotate right of each i16 element of %x by the matching element of %amt,
; written as @llvm.fshr.v8i16 with %x supplied for both data operands.
; In the expectations below, only the avx512vbmi2 configurations get a single
; instruction (vpshrdvw); XOP negates the amount and uses vprotw. The
; avx512bw configurations pair vpsllvw with vpsrlvw, while AVX2 and the
; AVX512F-only / AVX512VL-only configurations zero-extend to dwords and use
; vpsllvd / vpsrlvd before narrowing again. The SSE2, SSE4.1 and AVX1
; expansions build a per-lane multiplier (shift into bit 23, add 1065353216,
; which is 0x3F800000, then cvttps2dq) and OR the pmullw and pmulhuw results.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
319 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
320 ; SSE2-LABEL: var_funnnel_v8i16:
322 ; SSE2-NEXT: pxor %xmm2, %xmm2
323 ; SSE2-NEXT: psubw %xmm1, %xmm2
324 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
325 ; SSE2-NEXT: movdqa %xmm2, %xmm1
326 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
327 ; SSE2-NEXT: pslld $23, %xmm1
328 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
329 ; SSE2-NEXT: paddd %xmm3, %xmm1
330 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
331 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
332 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
333 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
334 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
335 ; SSE2-NEXT: pslld $23, %xmm2
336 ; SSE2-NEXT: paddd %xmm3, %xmm2
337 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
338 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
339 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
340 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
341 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
342 ; SSE2-NEXT: movdqa %xmm0, %xmm1
343 ; SSE2-NEXT: pmulhuw %xmm2, %xmm1
344 ; SSE2-NEXT: pmullw %xmm2, %xmm0
345 ; SSE2-NEXT: por %xmm1, %xmm0
348 ; SSE41-LABEL: var_funnnel_v8i16:
350 ; SSE41-NEXT: pxor %xmm2, %xmm2
351 ; SSE41-NEXT: psubw %xmm1, %xmm2
352 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
353 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
354 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
355 ; SSE41-NEXT: pslld $23, %xmm2
356 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
357 ; SSE41-NEXT: paddd %xmm3, %xmm2
358 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
359 ; SSE41-NEXT: pslld $23, %xmm1
360 ; SSE41-NEXT: paddd %xmm3, %xmm1
361 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
362 ; SSE41-NEXT: packusdw %xmm2, %xmm1
363 ; SSE41-NEXT: movdqa %xmm0, %xmm2
364 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2
365 ; SSE41-NEXT: pmullw %xmm1, %xmm0
366 ; SSE41-NEXT: por %xmm2, %xmm0
369 ; AVX1-LABEL: var_funnnel_v8i16:
371 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
372 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
373 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
374 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
375 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
376 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
377 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
378 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
379 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
380 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
381 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
382 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
383 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
384 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
385 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
386 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
389 ; AVX2-LABEL: var_funnnel_v8i16:
391 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
392 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
393 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
394 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
395 ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2
396 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
397 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
398 ; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2
399 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
400 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
401 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
402 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
403 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
404 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
405 ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
406 ; AVX2-NEXT: vzeroupper
409 ; AVX512F-LABEL: var_funnnel_v8i16:
411 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
412 ; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1
413 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
414 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
415 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
416 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
417 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
418 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
419 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
420 ; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
421 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
422 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
423 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
424 ; AVX512F-NEXT: vzeroupper
427 ; AVX512VL-LABEL: var_funnnel_v8i16:
429 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
430 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1
431 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
432 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
433 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
434 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2
435 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
436 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
437 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
438 ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
439 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
440 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
441 ; AVX512VL-NEXT: vzeroupper
442 ; AVX512VL-NEXT: retq
444 ; AVX512BW-LABEL: var_funnnel_v8i16:
446 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
447 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
448 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm1
449 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
450 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
451 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
452 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
453 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
454 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
455 ; AVX512BW-NEXT: vzeroupper
456 ; AVX512BW-NEXT: retq
458 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
459 ; AVX512VLBW: # %bb.0:
460 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
461 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1
462 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
463 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
464 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
465 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
466 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
467 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
468 ; AVX512VLBW-NEXT: retq
470 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
471 ; AVX512VBMI2: # %bb.0:
472 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
473 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
474 ; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0
475 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
476 ; AVX512VBMI2-NEXT: vzeroupper
477 ; AVX512VBMI2-NEXT: retq
479 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
480 ; AVX512VLVBMI2: # %bb.0:
481 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm1, %xmm0, %xmm0
482 ; AVX512VLVBMI2-NEXT: retq
484 ; XOP-LABEL: var_funnnel_v8i16:
486 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
487 ; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
488 ; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
491 ; X86-SSE2-LABEL: var_funnnel_v8i16:
493 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
494 ; X86-SSE2-NEXT: psubw %xmm1, %xmm2
495 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
496 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
497 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
498 ; X86-SSE2-NEXT: pslld $23, %xmm1
499 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
500 ; X86-SSE2-NEXT: paddd %xmm3, %xmm1
501 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
502 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
503 ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
504 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
505 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
506 ; X86-SSE2-NEXT: pslld $23, %xmm2
507 ; X86-SSE2-NEXT: paddd %xmm3, %xmm2
508 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
509 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
510 ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
511 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
512 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
513 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
514 ; X86-SSE2-NEXT: pmulhuw %xmm2, %xmm1
515 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
516 ; X86-SSE2-NEXT: por %xmm1, %xmm0
517 ; X86-SSE2-NEXT: retl
518 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %amt)
522 define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
523 ; SSE2-LABEL: var_funnnel_v16i8:
525 ; SSE2-NEXT: movdqa %xmm0, %xmm2
526 ; SSE2-NEXT: pxor %xmm0, %xmm0
527 ; SSE2-NEXT: pxor %xmm3, %xmm3
528 ; SSE2-NEXT: psubb %xmm1, %xmm3
529 ; SSE2-NEXT: psllw $5, %xmm3
530 ; SSE2-NEXT: pxor %xmm1, %xmm1
531 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
532 ; SSE2-NEXT: movdqa %xmm2, %xmm4
533 ; SSE2-NEXT: psrlw $4, %xmm4
534 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
535 ; SSE2-NEXT: movdqa %xmm2, %xmm5
536 ; SSE2-NEXT: psllw $4, %xmm5
537 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
538 ; SSE2-NEXT: por %xmm4, %xmm5
539 ; SSE2-NEXT: pand %xmm1, %xmm5
540 ; SSE2-NEXT: pandn %xmm2, %xmm1
541 ; SSE2-NEXT: por %xmm5, %xmm1
542 ; SSE2-NEXT: movdqa %xmm1, %xmm2
543 ; SSE2-NEXT: psrlw $6, %xmm2
544 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
545 ; SSE2-NEXT: movdqa %xmm1, %xmm4
546 ; SSE2-NEXT: psllw $2, %xmm4
547 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
548 ; SSE2-NEXT: por %xmm2, %xmm4
549 ; SSE2-NEXT: paddb %xmm3, %xmm3
550 ; SSE2-NEXT: pxor %xmm2, %xmm2
551 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
552 ; SSE2-NEXT: pand %xmm2, %xmm4
553 ; SSE2-NEXT: pandn %xmm1, %xmm2
554 ; SSE2-NEXT: por %xmm4, %xmm2
555 ; SSE2-NEXT: movdqa %xmm2, %xmm1
556 ; SSE2-NEXT: paddb %xmm2, %xmm1
557 ; SSE2-NEXT: movdqa %xmm2, %xmm4
558 ; SSE2-NEXT: psrlw $7, %xmm4
559 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
560 ; SSE2-NEXT: por %xmm1, %xmm4
561 ; SSE2-NEXT: paddb %xmm3, %xmm3
562 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm0
563 ; SSE2-NEXT: pand %xmm0, %xmm4
564 ; SSE2-NEXT: pandn %xmm2, %xmm0
565 ; SSE2-NEXT: por %xmm4, %xmm0
568 ; SSE41-LABEL: var_funnnel_v16i8:
570 ; SSE41-NEXT: movdqa %xmm0, %xmm2
571 ; SSE41-NEXT: psrlw $4, %xmm0
572 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
573 ; SSE41-NEXT: movdqa %xmm2, %xmm3
574 ; SSE41-NEXT: psllw $4, %xmm3
575 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
576 ; SSE41-NEXT: por %xmm0, %xmm3
577 ; SSE41-NEXT: pxor %xmm0, %xmm0
578 ; SSE41-NEXT: psubb %xmm1, %xmm0
579 ; SSE41-NEXT: psllw $5, %xmm0
580 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
581 ; SSE41-NEXT: movdqa %xmm2, %xmm1
582 ; SSE41-NEXT: psrlw $6, %xmm1
583 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
584 ; SSE41-NEXT: movdqa %xmm2, %xmm3
585 ; SSE41-NEXT: psllw $2, %xmm3
586 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
587 ; SSE41-NEXT: por %xmm1, %xmm3
588 ; SSE41-NEXT: paddb %xmm0, %xmm0
589 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
590 ; SSE41-NEXT: movdqa %xmm2, %xmm1
591 ; SSE41-NEXT: paddb %xmm2, %xmm1
592 ; SSE41-NEXT: movdqa %xmm2, %xmm3
593 ; SSE41-NEXT: psrlw $7, %xmm3
594 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
595 ; SSE41-NEXT: por %xmm1, %xmm3
596 ; SSE41-NEXT: paddb %xmm0, %xmm0
597 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
598 ; SSE41-NEXT: movdqa %xmm2, %xmm0
601 ; AVX-LABEL: var_funnnel_v16i8:
603 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
604 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
605 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm3
606 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
607 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
608 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
609 ; AVX-NEXT: vpsubb %xmm1, %xmm3, %xmm1
610 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
611 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
612 ; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2
613 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
614 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm3
615 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
616 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
617 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
618 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
619 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
620 ; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3
621 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
622 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
623 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
624 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
627 ; AVX512F-LABEL: var_funnnel_v16i8:
629 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
630 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
631 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
632 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
633 ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3
634 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
635 ; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1
636 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
637 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
638 ; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
639 ; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
640 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
641 ; AVX512F-NEXT: vzeroupper
644 ; AVX512VL-LABEL: var_funnnel_v16i8:
646 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
647 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
648 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero
649 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
650 ; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm3
651 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
652 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1
653 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
654 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
655 ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
656 ; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
657 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
658 ; AVX512VL-NEXT: vzeroupper
659 ; AVX512VL-NEXT: retq
661 ; AVX512BW-LABEL: var_funnnel_v16i8:
663 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
664 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
665 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
666 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
667 ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
668 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
669 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
670 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
671 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
672 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
673 ; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0
674 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
675 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
676 ; AVX512BW-NEXT: vzeroupper
677 ; AVX512BW-NEXT: retq
679 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
680 ; AVX512VLBW: # %bb.0:
681 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
682 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
683 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
684 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
685 ; AVX512VLBW-NEXT: vpsrlvw %ymm3, %ymm0, %ymm3
686 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
687 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
688 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
689 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
690 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
691 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0
692 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
693 ; AVX512VLBW-NEXT: vzeroupper
694 ; AVX512VLBW-NEXT: retq
696 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
697 ; AVX512VBMI2: # %bb.0:
698 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
699 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
700 ; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
701 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
702 ; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
703 ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
704 ; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
705 ; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
706 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
707 ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
708 ; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
709 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
710 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
711 ; AVX512VBMI2-NEXT: vzeroupper
712 ; AVX512VBMI2-NEXT: retq
714 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
715 ; AVX512VLVBMI2: # %bb.0:
716 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
717 ; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
718 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
719 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
720 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm3, %ymm0, %ymm3
721 ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
722 ; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
723 ; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
724 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
725 ; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
726 ; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
727 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
728 ; AVX512VLVBMI2-NEXT: vzeroupper
729 ; AVX512VLVBMI2-NEXT: retq
731 ; XOP-LABEL: var_funnnel_v16i8:
733 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
734 ; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
735 ; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
738 ; X86-SSE2-LABEL: var_funnnel_v16i8:
740 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
741 ; X86-SSE2-NEXT: pxor %xmm0, %xmm0
742 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
743 ; X86-SSE2-NEXT: psubb %xmm1, %xmm3
744 ; X86-SSE2-NEXT: psllw $5, %xmm3
745 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
746 ; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm1
747 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
748 ; X86-SSE2-NEXT: psrlw $4, %xmm4
749 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
750 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
751 ; X86-SSE2-NEXT: psllw $4, %xmm5
752 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
753 ; X86-SSE2-NEXT: por %xmm4, %xmm5
754 ; X86-SSE2-NEXT: pand %xmm1, %xmm5
755 ; X86-SSE2-NEXT: pandn %xmm2, %xmm1
756 ; X86-SSE2-NEXT: por %xmm5, %xmm1
757 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
758 ; X86-SSE2-NEXT: psrlw $6, %xmm2
759 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
760 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
761 ; X86-SSE2-NEXT: psllw $2, %xmm4
762 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
763 ; X86-SSE2-NEXT: por %xmm2, %xmm4
764 ; X86-SSE2-NEXT: paddb %xmm3, %xmm3
765 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
766 ; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm2
767 ; X86-SSE2-NEXT: pand %xmm2, %xmm4
768 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
769 ; X86-SSE2-NEXT: por %xmm4, %xmm2
770 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
771 ; X86-SSE2-NEXT: paddb %xmm2, %xmm1
772 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
773 ; X86-SSE2-NEXT: psrlw $7, %xmm4
774 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
775 ; X86-SSE2-NEXT: por %xmm1, %xmm4
776 ; X86-SSE2-NEXT: paddb %xmm3, %xmm3
777 ; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm0
778 ; X86-SSE2-NEXT: pand %xmm0, %xmm4
779 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0
780 ; X86-SSE2-NEXT: por %xmm4, %xmm0
781 ; X86-SSE2-NEXT: retl
782 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %amt)
787 ; Uniform Variable Shifts
; splatvar_funnnel_v2i64 - rotate right of each i64 lane of %x by a uniform
; (splatted) amount. The IR body broadcasts lane 0 of %amt and feeds it to
; llvm.fshr with both data operands equal to %x.
;
; Lowerings checked below: the AVX512 runs expect vpbroadcastq + vprorvq, the
; XOP runs negate the amount and use vprotq, and the plain SSE/AVX runs expect
; a psrlq/psllq/por sequence with the amount masked by 63.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
790 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind {
791 ; SSE-LABEL: splatvar_funnnel_v2i64:
793 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [63,63]
794 ; SSE-NEXT: pxor %xmm3, %xmm3
795 ; SSE-NEXT: psubq %xmm1, %xmm3
796 ; SSE-NEXT: pand %xmm2, %xmm1
797 ; SSE-NEXT: movdqa %xmm0, %xmm4
798 ; SSE-NEXT: psrlq %xmm1, %xmm4
799 ; SSE-NEXT: pand %xmm2, %xmm3
800 ; SSE-NEXT: psllq %xmm3, %xmm0
801 ; SSE-NEXT: por %xmm4, %xmm0
804 ; AVX-LABEL: splatvar_funnnel_v2i64:
806 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
807 ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
808 ; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3
809 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
810 ; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1
811 ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
812 ; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
813 ; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
816 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
818 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
819 ; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
820 ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0
821 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
822 ; AVX512F-NEXT: vzeroupper
825 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
827 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
828 ; AVX512VL-NEXT: vprorvq %xmm1, %xmm0, %xmm0
829 ; AVX512VL-NEXT: retq
831 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
833 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
834 ; AVX512BW-NEXT: vpbroadcastq %xmm1, %xmm1
835 ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0
836 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
837 ; AVX512BW-NEXT: vzeroupper
838 ; AVX512BW-NEXT: retq
840 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
841 ; AVX512VLBW: # %bb.0:
842 ; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %xmm1
843 ; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0
844 ; AVX512VLBW-NEXT: retq
846 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
847 ; AVX512VBMI2: # %bb.0:
848 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
849 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
850 ; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
851 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
852 ; AVX512VBMI2-NEXT: vzeroupper
853 ; AVX512VBMI2-NEXT: retq
855 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
856 ; AVX512VLVBMI2: # %bb.0:
857 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
858 ; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0
859 ; AVX512VLVBMI2-NEXT: retq
861 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
863 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
864 ; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
865 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
866 ; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0
869 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
871 ; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
872 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
873 ; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
874 ; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0
877 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
879 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
880 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
881 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
882 ; X86-SSE2-NEXT: psubq %xmm1, %xmm3
883 ; X86-SSE2-NEXT: pand %xmm2, %xmm1
884 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
885 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm4
886 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
887 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
888 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm5
889 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
890 ; X86-SSE2-NEXT: pand %xmm2, %xmm3
891 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
892 ; X86-SSE2-NEXT: psllq %xmm3, %xmm1
893 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
894 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
895 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
896 ; X86-SSE2-NEXT: orpd %xmm5, %xmm0
897 ; X86-SSE2-NEXT: retl
; Broadcast lane 0 of %amt to both lanes (all-zero shuffle mask).
898 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
; Both data operands are %x, so this funnel shift right is a rotate right.
899 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)
; splatvar_funnnel_v4i32 - rotate right of each i32 lane of %x by a uniform
; (splatted) amount. The IR body broadcasts lane 0 of %amt and feeds it to
; llvm.fshr with both data operands equal to %x.
;
; Lowerings checked below: the AVX512 runs expect vpbroadcastd + vprorvd, the
; XOP runs negate the amount and use vprotd, and the remaining runs expect a
; pslld/psrld/por sequence built from the negated amount and (32 - amount).
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
903 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind {
904 ; SSE2-LABEL: splatvar_funnnel_v4i32:
906 ; SSE2-NEXT: movd %xmm1, %eax
907 ; SSE2-NEXT: negl %eax
908 ; SSE2-NEXT: andl $31, %eax
909 ; SSE2-NEXT: movd %eax, %xmm1
910 ; SSE2-NEXT: movdqa %xmm0, %xmm2
911 ; SSE2-NEXT: pslld %xmm1, %xmm2
912 ; SSE2-NEXT: movl $32, %ecx
913 ; SSE2-NEXT: subl %eax, %ecx
914 ; SSE2-NEXT: movd %ecx, %xmm1
915 ; SSE2-NEXT: psrld %xmm1, %xmm0
916 ; SSE2-NEXT: por %xmm2, %xmm0
919 ; SSE41-LABEL: splatvar_funnnel_v4i32:
921 ; SSE41-NEXT: pxor %xmm2, %xmm2
922 ; SSE41-NEXT: psubd %xmm1, %xmm2
923 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
924 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
925 ; SSE41-NEXT: movdqa %xmm0, %xmm3
926 ; SSE41-NEXT: pslld %xmm1, %xmm3
927 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32]
928 ; SSE41-NEXT: psubd %xmm2, %xmm1
929 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
930 ; SSE41-NEXT: psrld %xmm1, %xmm0
931 ; SSE41-NEXT: por %xmm3, %xmm0
934 ; AVX1-LABEL: splatvar_funnnel_v4i32:
936 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
937 ; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
938 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
939 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
940 ; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
941 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
942 ; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
943 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
944 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
945 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
948 ; AVX2-LABEL: splatvar_funnnel_v4i32:
950 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
951 ; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1
952 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
953 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
954 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
955 ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2
956 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
957 ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
958 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
959 ; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0
960 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
963 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
965 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
966 ; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
967 ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
968 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
969 ; AVX512F-NEXT: vzeroupper
972 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
974 ; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
975 ; AVX512VL-NEXT: vprorvd %xmm1, %xmm0, %xmm0
976 ; AVX512VL-NEXT: retq
978 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
980 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
981 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
982 ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
983 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
984 ; AVX512BW-NEXT: vzeroupper
985 ; AVX512BW-NEXT: retq
987 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
988 ; AVX512VLBW: # %bb.0:
989 ; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1
990 ; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
991 ; AVX512VLBW-NEXT: retq
993 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
994 ; AVX512VBMI2: # %bb.0:
995 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
996 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
997 ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
998 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
999 ; AVX512VBMI2-NEXT: vzeroupper
1000 ; AVX512VBMI2-NEXT: retq
1002 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1003 ; AVX512VLVBMI2: # %bb.0:
1004 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
1005 ; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
1006 ; AVX512VLVBMI2-NEXT: retq
1008 ; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
1010 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1011 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1012 ; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
1013 ; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0
1014 ; XOPAVX1-NEXT: retq
1016 ; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
1018 ; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1
1019 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1020 ; XOPAVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm1
1021 ; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0
1022 ; XOPAVX2-NEXT: retq
1024 ; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1025 ; X86-SSE2: # %bb.0:
1026 ; X86-SSE2-NEXT: movd %xmm1, %eax
1027 ; X86-SSE2-NEXT: negl %eax
1028 ; X86-SSE2-NEXT: andl $31, %eax
1029 ; X86-SSE2-NEXT: movd %eax, %xmm1
1030 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
1031 ; X86-SSE2-NEXT: pslld %xmm1, %xmm2
1032 ; X86-SSE2-NEXT: movl $32, %ecx
1033 ; X86-SSE2-NEXT: subl %eax, %ecx
1034 ; X86-SSE2-NEXT: movd %ecx, %xmm1
1035 ; X86-SSE2-NEXT: psrld %xmm1, %xmm0
1036 ; X86-SSE2-NEXT: por %xmm2, %xmm0
1037 ; X86-SSE2-NEXT: retl
; Broadcast lane 0 of %amt to all four lanes (all-zero shuffle mask).
1038 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
; Both data operands are %x, so this funnel shift right is a rotate right.
1039 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat)
; splatvar_funnnel_v8i16 - rotate right of each i16 lane of %x by a uniform
; (splatted) amount. The IR body broadcasts lane 0 of %amt and feeds it to
; llvm.fshr with both data operands equal to %x.
;
; Lowerings checked below: only the two VBMI2 runs expect a direct
; vpbroadcastw + vpshrdvw, the XOP runs negate the amount and use vprotw, and
; every other run expects a psllw/psrlw/por sequence built from the negated
; amount and (16 - amount).
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1043 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind {
1044 ; SSE2-LABEL: splatvar_funnnel_v8i16:
1046 ; SSE2-NEXT: pxor %xmm2, %xmm2
1047 ; SSE2-NEXT: psubw %xmm1, %xmm2
1048 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1049 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
1050 ; SSE2-NEXT: pand %xmm2, %xmm1
1051 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1052 ; SSE2-NEXT: psllw %xmm1, %xmm3
1053 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
1054 ; SSE2-NEXT: psubw %xmm2, %xmm1
1055 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1056 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1057 ; SSE2-NEXT: psrlw %xmm1, %xmm0
1058 ; SSE2-NEXT: por %xmm3, %xmm0
1061 ; SSE41-LABEL: splatvar_funnnel_v8i16:
1063 ; SSE41-NEXT: pxor %xmm2, %xmm2
1064 ; SSE41-NEXT: psubw %xmm1, %xmm2
1065 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1066 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1067 ; SSE41-NEXT: movdqa %xmm0, %xmm3
1068 ; SSE41-NEXT: psllw %xmm1, %xmm3
1069 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
1070 ; SSE41-NEXT: psubw %xmm2, %xmm1
1071 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1072 ; SSE41-NEXT: psrlw %xmm1, %xmm0
1073 ; SSE41-NEXT: por %xmm3, %xmm0
1076 ; AVX-LABEL: splatvar_funnnel_v8i16:
1078 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
1079 ; AVX-NEXT: vpsubw %xmm1, %xmm2, %xmm1
1080 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1081 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1082 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm2
1083 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
1084 ; AVX-NEXT: vpsubw %xmm1, %xmm3, %xmm1
1085 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1086 ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1087 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
1090 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
1092 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
1093 ; AVX512F-NEXT: vpsubw %xmm1, %xmm2, %xmm1
1094 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1095 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1096 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm2
1097 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
1098 ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
1099 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1100 ; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1101 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0
1102 ; AVX512F-NEXT: retq
1104 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1105 ; AVX512VL: # %bb.0:
1106 ; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
1107 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm1
1108 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1109 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1110 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm2
1111 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
1112 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
1113 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1114 ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1115 ; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0
1116 ; AVX512VL-NEXT: retq
1118 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1119 ; AVX512BW: # %bb.0:
1120 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
1121 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm1
1122 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1123 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1124 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm2
1125 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
1126 ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
1127 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1128 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1129 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
1130 ; AVX512BW-NEXT: retq
1132 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1133 ; AVX512VLBW: # %bb.0:
1134 ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
1135 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm2, %xmm1
1136 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1137 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1138 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm2
1139 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
1140 ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1
1141 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1142 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1143 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
1144 ; AVX512VLBW-NEXT: retq
1146 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1147 ; AVX512VBMI2: # %bb.0:
1148 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1149 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1
1150 ; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0
1151 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1152 ; AVX512VBMI2-NEXT: vzeroupper
1153 ; AVX512VBMI2-NEXT: retq
1155 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1156 ; AVX512VLVBMI2: # %bb.0:
1157 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1
1158 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm1, %xmm0, %xmm0
1159 ; AVX512VLVBMI2-NEXT: retq
1161 ; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
1163 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1164 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
1165 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1166 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1167 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0
1168 ; XOPAVX1-NEXT: retq
1170 ; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
1172 ; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1
1173 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1174 ; XOPAVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
1175 ; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0
1176 ; XOPAVX2-NEXT: retq
1178 ; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1179 ; X86-SSE2: # %bb.0:
1180 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
1181 ; X86-SSE2-NEXT: psubw %xmm1, %xmm2
1182 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1183 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
1184 ; X86-SSE2-NEXT: pand %xmm2, %xmm1
1185 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
1186 ; X86-SSE2-NEXT: psllw %xmm1, %xmm3
1187 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
1188 ; X86-SSE2-NEXT: psubw %xmm2, %xmm1
1189 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1190 ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1191 ; X86-SSE2-NEXT: psrlw %xmm1, %xmm0
1192 ; X86-SSE2-NEXT: por %xmm3, %xmm0
1193 ; X86-SSE2-NEXT: retl
; Broadcast lane 0 of %amt to all eight lanes (all-zero shuffle mask).
1194 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
; Both data operands are %x, so this funnel shift right is a rotate right.
1195 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> %splat)
1199 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind {
1200 ; SSE2-LABEL: splatvar_funnnel_v16i8:
1202 ; SSE2-NEXT: pxor %xmm2, %xmm2
1203 ; SSE2-NEXT: psubb %xmm1, %xmm2
1204 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1205 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1206 ; SSE2-NEXT: psubb %xmm2, %xmm3
1207 ; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
1208 ; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1209 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1210 ; SSE2-NEXT: psllw %xmm2, %xmm1
1211 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1212 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
1213 ; SSE2-NEXT: psllw %xmm2, %xmm5
1214 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1215 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
1216 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1217 ; SSE2-NEXT: pand %xmm2, %xmm1
1218 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
1219 ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1220 ; SSE2-NEXT: psrlw %xmm3, %xmm0
1221 ; SSE2-NEXT: psrlw %xmm3, %xmm4
1222 ; SSE2-NEXT: psrlw $8, %xmm4
1223 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1224 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
1225 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1226 ; SSE2-NEXT: pand %xmm0, %xmm2
1227 ; SSE2-NEXT: por %xmm2, %xmm1
1228 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1231 ; SSE41-LABEL: splatvar_funnnel_v16i8:
1233 ; SSE41-NEXT: pxor %xmm2, %xmm2
1234 ; SSE41-NEXT: pxor %xmm3, %xmm3
1235 ; SSE41-NEXT: psubb %xmm1, %xmm3
1236 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1237 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1238 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1239 ; SSE41-NEXT: psllw %xmm4, %xmm1
1240 ; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
1241 ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
1242 ; SSE41-NEXT: psllw %xmm4, %xmm6
1243 ; SSE41-NEXT: pshufb %xmm2, %xmm6
1244 ; SSE41-NEXT: pand %xmm6, %xmm1
1245 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1246 ; SSE41-NEXT: psubb %xmm3, %xmm2
1247 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1248 ; SSE41-NEXT: psrlw %xmm2, %xmm0
1249 ; SSE41-NEXT: psrlw %xmm2, %xmm5
1250 ; SSE41-NEXT: pshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1251 ; SSE41-NEXT: pand %xmm0, %xmm5
1252 ; SSE41-NEXT: por %xmm5, %xmm1
1253 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1256 ; AVX1-LABEL: splatvar_funnnel_v16i8:
1258 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1259 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1260 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1261 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1262 ; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm4
1263 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1264 ; AVX1-NEXT: vpsllw %xmm3, %xmm5, %xmm3
1265 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
1266 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1267 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1268 ; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
1269 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1270 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1271 ; AVX1-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
1272 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1273 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
1274 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1277 ; AVX2-LABEL: splatvar_funnnel_v16i8:
1279 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1280 ; AVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1281 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1282 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1283 ; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3
1284 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
1285 ; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
1286 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1287 ; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2
1288 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1289 ; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
1290 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1291 ; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
1292 ; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
1293 ; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
1294 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1295 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
1296 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
1299 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
1301 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1302 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
1303 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1304 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1305 ; AVX512F-NEXT: vpsrld %xmm3, %zmm0, %zmm3
1306 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
1307 ; AVX512F-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1308 ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
1309 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1310 ; AVX512F-NEXT: vpslld %xmm1, %zmm0, %zmm0
1311 ; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
1312 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1313 ; AVX512F-NEXT: vzeroupper
1314 ; AVX512F-NEXT: retq
1316 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1317 ; AVX512VL: # %bb.0:
1318 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1319 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
1320 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1321 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1322 ; AVX512VL-NEXT: vpsrld %xmm3, %zmm0, %zmm3
1323 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
1324 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1325 ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
1326 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1327 ; AVX512VL-NEXT: vpslld %xmm1, %zmm0, %zmm0
1328 ; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
1329 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1330 ; AVX512VL-NEXT: vzeroupper
1331 ; AVX512VL-NEXT: retq
1333 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1334 ; AVX512BW: # %bb.0:
1335 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1336 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
1337 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1338 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1339 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
1340 ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
1341 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1342 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
1343 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1344 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1345 ; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm0
1346 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1347 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1348 ; AVX512BW-NEXT: vzeroupper
1349 ; AVX512BW-NEXT: retq
1351 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1352 ; AVX512VLBW: # %bb.0:
1353 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1354 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
1355 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1356 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1357 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
1358 ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
1359 ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1360 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
1361 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1362 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1363 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm3, %ymm0
1364 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1365 ; AVX512VLBW-NEXT: vzeroupper
1366 ; AVX512VLBW-NEXT: retq
1368 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1369 ; AVX512VBMI2: # %bb.0:
1370 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1371 ; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
1372 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1373 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1374 ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
1375 ; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1376 ; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1377 ; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
1378 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1379 ; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1380 ; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
1381 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1382 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1383 ; AVX512VBMI2-NEXT: vzeroupper
1384 ; AVX512VBMI2-NEXT: retq
1386 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1387 ; AVX512VLVBMI2: # %bb.0:
1388 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1389 ; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
1390 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1391 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1392 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
1393 ; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
1394 ; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1395 ; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
1396 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1397 ; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1398 ; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
1399 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1400 ; AVX512VLVBMI2-NEXT: vzeroupper
1401 ; AVX512VLVBMI2-NEXT: retq
1403 ; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
1405 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1406 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1407 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1408 ; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0
1409 ; XOPAVX1-NEXT: retq
1411 ; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
1413 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
1414 ; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1415 ; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
1416 ; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0
1417 ; XOPAVX2-NEXT: retq
1419 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1420 ; X86-SSE2: # %bb.0:
1421 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
1422 ; X86-SSE2-NEXT: psubb %xmm1, %xmm2
1423 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1424 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1425 ; X86-SSE2-NEXT: psubb %xmm2, %xmm3
1426 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
1427 ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1428 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1429 ; X86-SSE2-NEXT: psllw %xmm2, %xmm1
1430 ; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1431 ; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5
1432 ; X86-SSE2-NEXT: psllw %xmm2, %xmm5
1433 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1434 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,0,0,0,4,5,6,7]
1435 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1436 ; X86-SSE2-NEXT: pand %xmm2, %xmm1
1437 ; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
1438 ; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1439 ; X86-SSE2-NEXT: psrlw %xmm3, %xmm0
1440 ; X86-SSE2-NEXT: psrlw %xmm3, %xmm4
1441 ; X86-SSE2-NEXT: psrlw $8, %xmm4
1442 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1443 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
1444 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1445 ; X86-SSE2-NEXT: pand %xmm0, %xmm2
1446 ; X86-SSE2-NEXT: por %xmm2, %xmm1
1447 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
1448 ; X86-SSE2-NEXT: retl
1449 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
1450 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> %splat)
; Rotate right by per-element constants. The fshr call at the end of this
; function passes %x as both data operands with amounts <4, 14>, so each i64
; lane is rotated right by its own amount. The AVX512 runs are expected to use
; vprorvq and the XOP runs vprotq; the remaining runs combine a left shift and
; a right shift with an OR (e.g. psllq $60 with psrlq $4 for lane 0 and
; psllq $50 with psrlq $14 for lane 1).
1458 define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind {
1459 ; SSE2-LABEL: constant_funnnel_v2i64:
1461 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1462 ; SSE2-NEXT: psllq $60, %xmm1
1463 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1464 ; SSE2-NEXT: psllq $50, %xmm2
1465 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1466 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1467 ; SSE2-NEXT: psrlq $4, %xmm1
1468 ; SSE2-NEXT: psrlq $14, %xmm0
1469 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1470 ; SSE2-NEXT: orpd %xmm2, %xmm0
1473 ; SSE41-LABEL: constant_funnnel_v2i64:
1475 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1476 ; SSE41-NEXT: psllq $50, %xmm1
1477 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1478 ; SSE41-NEXT: psllq $60, %xmm2
1479 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1480 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1481 ; SSE41-NEXT: psrlq $14, %xmm1
1482 ; SSE41-NEXT: psrlq $4, %xmm0
1483 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1484 ; SSE41-NEXT: por %xmm2, %xmm0
1487 ; AVX1-LABEL: constant_funnnel_v2i64:
1489 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm1
1490 ; AVX1-NEXT: vpsllq $60, %xmm0, %xmm2
1491 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1492 ; AVX1-NEXT: vpsrlq $14, %xmm0, %xmm2
1493 ; AVX1-NEXT: vpsrlq $4, %xmm0, %xmm0
1494 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1495 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1498 ; AVX2-LABEL: constant_funnnel_v2i64:
1500 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1501 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1502 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1505 ; AVX512F-LABEL: constant_funnnel_v2i64:
1507 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1508 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
1509 ; AVX512F-NEXT: vprorvq %zmm1, %zmm0, %zmm0
1510 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1511 ; AVX512F-NEXT: vzeroupper
1512 ; AVX512F-NEXT: retq
1514 ; AVX512VL-LABEL: constant_funnnel_v2i64:
1515 ; AVX512VL: # %bb.0:
1516 ; AVX512VL-NEXT: vprorvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1517 ; AVX512VL-NEXT: retq
1519 ; AVX512BW-LABEL: constant_funnnel_v2i64:
1520 ; AVX512BW: # %bb.0:
1521 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1522 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
1523 ; AVX512BW-NEXT: vprorvq %zmm1, %zmm0, %zmm0
1524 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1525 ; AVX512BW-NEXT: vzeroupper
1526 ; AVX512BW-NEXT: retq
1528 ; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1529 ; AVX512VLBW: # %bb.0:
1530 ; AVX512VLBW-NEXT: vprorvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1531 ; AVX512VLBW-NEXT: retq
1533 ; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1534 ; AVX512VBMI2: # %bb.0:
1535 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1536 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
1537 ; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
1538 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1539 ; AVX512VBMI2-NEXT: vzeroupper
1540 ; AVX512VBMI2-NEXT: retq
1542 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1543 ; AVX512VLVBMI2: # %bb.0:
1544 ; AVX512VLVBMI2-NEXT: vprorvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1545 ; AVX512VLVBMI2-NEXT: retq
1547 ; XOP-LABEL: constant_funnnel_v2i64:
1549 ; XOP-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1552 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
1553 ; X86-SSE2: # %bb.0:
1554 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,0,63,0]
1555 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <4,u,14,u>
1556 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
1557 ; X86-SSE2-NEXT: psubq %xmm2, %xmm3
1558 ; X86-SSE2-NEXT: pand %xmm1, %xmm2
1559 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
1560 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm4
1561 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
1562 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
1563 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm5
1564 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
1565 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1566 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1567 ; X86-SSE2-NEXT: psllq %xmm3, %xmm1
1568 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
1569 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
1570 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1571 ; X86-SSE2-NEXT: orpd %xmm5, %xmm0
1572 ; X86-SSE2-NEXT: retl
1573 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
; Rotate right of each i32 lane by the constants <4, 5, 6, 7> (fshr with %x as
; both data operands). The SSE2, SSE4.1 and AVX1 expectations multiply by
; [268435456,134217728,67108864,33554432], i.e. 2^(32 - amount), with pmuludq
; and OR the shuffled product halves. AVX2 uses the variable shifts
; vpsrlvd/vpsllvd, the AVX512 runs use vprorvd, and the XOP runs use vprotd.
1577 define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind {
1578 ; SSE2-LABEL: constant_funnnel_v4i32:
1580 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
1581 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1582 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
1583 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
1584 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1585 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
1586 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1587 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1588 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1589 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1590 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1591 ; SSE2-NEXT: por %xmm3, %xmm0
1594 ; SSE41-LABEL: constant_funnnel_v4i32:
1596 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
1597 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1598 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1599 ; SSE41-NEXT: pmuludq %xmm2, %xmm3
1600 ; SSE41-NEXT: pmuludq %xmm1, %xmm0
1601 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1602 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1603 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1604 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1605 ; SSE41-NEXT: por %xmm1, %xmm0
1608 ; AVX1-LABEL: constant_funnnel_v4i32:
1610 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
1611 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1612 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1613 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
1614 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
1615 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1616 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1617 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1618 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1619 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1622 ; AVX2-LABEL: constant_funnnel_v4i32:
1624 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1625 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1626 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1629 ; AVX512F-LABEL: constant_funnnel_v4i32:
1631 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1632 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
1633 ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0
1634 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1635 ; AVX512F-NEXT: vzeroupper
1636 ; AVX512F-NEXT: retq
1638 ; AVX512VL-LABEL: constant_funnnel_v4i32:
1639 ; AVX512VL: # %bb.0:
1640 ; AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1641 ; AVX512VL-NEXT: retq
1643 ; AVX512BW-LABEL: constant_funnnel_v4i32:
1644 ; AVX512BW: # %bb.0:
1645 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1646 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
1647 ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0
1648 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1649 ; AVX512BW-NEXT: vzeroupper
1650 ; AVX512BW-NEXT: retq
1652 ; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1653 ; AVX512VLBW: # %bb.0:
1654 ; AVX512VLBW-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1655 ; AVX512VLBW-NEXT: retq
1657 ; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1658 ; AVX512VBMI2: # %bb.0:
1659 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1660 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
1661 ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
1662 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1663 ; AVX512VBMI2-NEXT: vzeroupper
1664 ; AVX512VBMI2-NEXT: retq
1666 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1667 ; AVX512VLVBMI2: # %bb.0:
1668 ; AVX512VLVBMI2-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1669 ; AVX512VLVBMI2-NEXT: retq
1671 ; XOP-LABEL: constant_funnnel_v4i32:
1673 ; XOP-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1676 ; X86-SSE2-LABEL: constant_funnnel_v4i32:
1677 ; X86-SSE2: # %bb.0:
1678 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [268435456,134217728,67108864,33554432]
1679 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1680 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
1681 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
1682 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1683 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
1684 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1685 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1686 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1687 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1688 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1689 ; X86-SSE2-NEXT: por %xmm3, %xmm0
1690 ; X86-SSE2-NEXT: retl
1691 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
; Rotate right of each i16 lane by the constants <0, 1, 2, 3, 4, 5, 6, 7>
; (fshr with %x as both data operands). The SSE, AVX, AVX512F and AVX512VL
; expectations multiply by [1,32768,16384,8192,4096,2048,1024,512] using
; pmulhuw and pmullw and OR the two products. The AVX512BW and AVX512VLBW runs
; use vpsrlvw/vpsllvw, the VBMI2 runs use vpshrdvw, and XOP uses vprotw.
1695 define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind {
1696 ; SSE-LABEL: constant_funnnel_v8i16:
1698 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512]
1699 ; SSE-NEXT: movdqa %xmm0, %xmm2
1700 ; SSE-NEXT: pmulhuw %xmm1, %xmm2
1701 ; SSE-NEXT: pmullw %xmm1, %xmm0
1702 ; SSE-NEXT: por %xmm2, %xmm0
1705 ; AVX-LABEL: constant_funnnel_v8i16:
1707 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512]
1708 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
1709 ; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1710 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
1713 ; AVX512F-LABEL: constant_funnnel_v8i16:
1715 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512]
1716 ; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
1717 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1718 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
1719 ; AVX512F-NEXT: retq
1721 ; AVX512VL-LABEL: constant_funnnel_v8i16:
1722 ; AVX512VL: # %bb.0:
1723 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512]
1724 ; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
1725 ; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
1726 ; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0
1727 ; AVX512VL-NEXT: retq
1729 ; AVX512BW-LABEL: constant_funnnel_v8i16:
1730 ; AVX512BW: # %bb.0:
1731 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1732 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7]
1733 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
1734 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9]
1735 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1736 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1737 ; AVX512BW-NEXT: vzeroupper
1738 ; AVX512BW-NEXT: retq
1740 ; AVX512VLBW-LABEL: constant_funnnel_v8i16:
1741 ; AVX512VLBW: # %bb.0:
1742 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1743 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1744 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1745 ; AVX512VLBW-NEXT: retq
1747 ; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
1748 ; AVX512VBMI2: # %bb.0:
1749 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1750 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1751 ; AVX512VBMI2-NEXT: vpshrdvw %zmm1, %zmm0, %zmm0
1752 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1753 ; AVX512VBMI2-NEXT: vzeroupper
1754 ; AVX512VBMI2-NEXT: retq
1756 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
1757 ; AVX512VLVBMI2: # %bb.0:
1758 ; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1759 ; AVX512VLVBMI2-NEXT: retq
1761 ; XOP-LABEL: constant_funnnel_v8i16:
1763 ; XOP-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1766 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
1767 ; X86-SSE2: # %bb.0:
1768 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,32768,16384,8192,4096,2048,1024,512]
1769 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
1770 ; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2
1771 ; X86-SSE2-NEXT: pmullw %xmm1, %xmm0
1772 ; X86-SSE2-NEXT: por %xmm2, %xmm0
1773 ; X86-SSE2-NEXT: retl
1774 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; Rotate right of each i8 lane by the constants
; <0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1> (fshr with %x as both data operands).
; Lane 8 uses an amount equal to the element width; the AVX512BW and
; AVX512VBMI2 expectations show 0 in that position of both shift-amount
; vectors. Most runs widen the bytes, shift or multiply, and narrow back
; before the final OR; the XOP runs use a single vprotb.
1778 define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind {
1779 ; SSE2-LABEL: constant_funnnel_v16i8:
1781 ; SSE2-NEXT: pxor %xmm1, %xmm1
1782 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1783 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1784 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1785 ; SSE2-NEXT: psrlw $8, %xmm2
1786 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1787 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1788 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1789 ; SSE2-NEXT: psrlw $8, %xmm3
1790 ; SSE2-NEXT: packuswb %xmm2, %xmm3
1791 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1792 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1793 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1794 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1795 ; SSE2-NEXT: pand %xmm2, %xmm1
1796 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1797 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1798 ; SSE2-NEXT: pand %xmm2, %xmm0
1799 ; SSE2-NEXT: packuswb %xmm1, %xmm0
1800 ; SSE2-NEXT: por %xmm3, %xmm0
1803 ; SSE41-LABEL: constant_funnnel_v16i8:
1805 ; SSE41-NEXT: movdqa %xmm0, %xmm2
1806 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1807 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1808 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1809 ; SSE41-NEXT: pand %xmm3, %xmm2
1810 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1811 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2]
1812 ; SSE41-NEXT: pmullw %xmm1, %xmm4
1813 ; SSE41-NEXT: pand %xmm3, %xmm4
1814 ; SSE41-NEXT: packuswb %xmm2, %xmm4
1815 ; SSE41-NEXT: pxor %xmm2, %xmm2
1816 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1817 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1818 ; SSE41-NEXT: psrlw $8, %xmm0
1819 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1820 ; SSE41-NEXT: psrlw $8, %xmm1
1821 ; SSE41-NEXT: packuswb %xmm0, %xmm1
1822 ; SSE41-NEXT: por %xmm4, %xmm1
1823 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1826 ; AVX1-LABEL: constant_funnnel_v16i8:
1828 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1829 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1830 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1831 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1832 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1833 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
1834 ; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
1835 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1836 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1837 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1838 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1839 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
1840 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm2
1841 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
1842 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
1843 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
1846 ; AVX2-LABEL: constant_funnnel_v16i8:
1848 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1849 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1850 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
1851 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1852 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1853 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1854 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1855 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1856 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1857 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1858 ; AVX2-NEXT: vzeroupper
1861 ; AVX512F-LABEL: constant_funnnel_v16i8:
1863 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1864 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1865 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1866 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
1867 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1868 ; AVX512F-NEXT: vzeroupper
1869 ; AVX512F-NEXT: retq
1871 ; AVX512VL-LABEL: constant_funnnel_v16i8:
1872 ; AVX512VL: # %bb.0:
1873 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1874 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1875 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1876 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
1877 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1878 ; AVX512VL-NEXT: vzeroupper
1879 ; AVX512VL-NEXT: retq
1881 ; AVX512BW-LABEL: constant_funnnel_v16i8:
1882 ; AVX512BW: # %bb.0:
1883 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1884 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1885 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
1886 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1887 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
1888 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
1889 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1890 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1891 ; AVX512BW-NEXT: vzeroupper
1892 ; AVX512BW-NEXT: retq
1894 ; AVX512VLBW-LABEL: constant_funnnel_v16i8:
1895 ; AVX512VLBW: # %bb.0:
1896 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1897 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1898 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1899 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
1900 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1901 ; AVX512VLBW-NEXT: vzeroupper
1902 ; AVX512VLBW-NEXT: retq
1904 ; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
1905 ; AVX512VBMI2: # %bb.0:
1906 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1907 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1908 ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
1909 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1910 ; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
1911 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
1912 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1913 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1914 ; AVX512VBMI2-NEXT: vzeroupper
1915 ; AVX512VBMI2-NEXT: retq
1917 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
1918 ; AVX512VLVBMI2: # %bb.0:
1919 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1920 ; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1921 ; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1922 ; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
1923 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1924 ; AVX512VLVBMI2-NEXT: vzeroupper
1925 ; AVX512VLVBMI2-NEXT: retq
1927 ; XOP-LABEL: constant_funnnel_v16i8:
1929 ; XOP-NEXT: vprotb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1932 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
1933 ; X86-SSE2: # %bb.0:
1934 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1935 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
1936 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1937 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1938 ; X86-SSE2-NEXT: psrlw $8, %xmm2
1939 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
1940 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1941 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1942 ; X86-SSE2-NEXT: psrlw $8, %xmm3
1943 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm3
1944 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1945 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1946 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1947 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1948 ; X86-SSE2-NEXT: pand %xmm2, %xmm1
1949 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1950 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1951 ; X86-SSE2-NEXT: pand %xmm2, %xmm0
1952 ; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
1953 ; X86-SSE2-NEXT: por %xmm3, %xmm0
1954 ; X86-SSE2-NEXT: retl
1955 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1960 ; Uniform Constant Shifts
; Rotate right of both i64 lanes by the same constant 14 (fshr with %x as both
; data operands and a splat amount). The SSE and AVX expectations OR an
; immediate left shift by 50 with an immediate right shift by 14; the AVX512
; runs use vprorq $14 and the XOP runs use vprotq $50.
1963 define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind {
1964 ; SSE-LABEL: splatconstant_funnnel_v2i64:
1966 ; SSE-NEXT: movdqa %xmm0, %xmm1
1967 ; SSE-NEXT: psllq $50, %xmm1
1968 ; SSE-NEXT: psrlq $14, %xmm0
1969 ; SSE-NEXT: por %xmm1, %xmm0
1972 ; AVX-LABEL: splatconstant_funnnel_v2i64:
1974 ; AVX-NEXT: vpsllq $50, %xmm0, %xmm1
1975 ; AVX-NEXT: vpsrlq $14, %xmm0, %xmm0
1976 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1979 ; AVX512F-LABEL: splatconstant_funnnel_v2i64:
1981 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1982 ; AVX512F-NEXT: vprorq $14, %zmm0, %zmm0
1983 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1984 ; AVX512F-NEXT: vzeroupper
1985 ; AVX512F-NEXT: retq
1987 ; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
1988 ; AVX512VL: # %bb.0:
1989 ; AVX512VL-NEXT: vprorq $14, %xmm0, %xmm0
1990 ; AVX512VL-NEXT: retq
1992 ; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
1993 ; AVX512BW: # %bb.0:
1994 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1995 ; AVX512BW-NEXT: vprorq $14, %zmm0, %zmm0
1996 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1997 ; AVX512BW-NEXT: vzeroupper
1998 ; AVX512BW-NEXT: retq
2000 ; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2001 ; AVX512VLBW: # %bb.0:
2002 ; AVX512VLBW-NEXT: vprorq $14, %xmm0, %xmm0
2003 ; AVX512VLBW-NEXT: retq
2005 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2006 ; AVX512VBMI2: # %bb.0:
2007 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2008 ; AVX512VBMI2-NEXT: vprorq $14, %zmm0, %zmm0
2009 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2010 ; AVX512VBMI2-NEXT: vzeroupper
2011 ; AVX512VBMI2-NEXT: retq
2013 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2014 ; AVX512VLVBMI2: # %bb.0:
2015 ; AVX512VLVBMI2-NEXT: vprorq $14, %xmm0, %xmm0
2016 ; AVX512VLVBMI2-NEXT: retq
2018 ; XOP-LABEL: splatconstant_funnnel_v2i64:
2020 ; XOP-NEXT: vprotq $50, %xmm0, %xmm0
2023 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
2024 ; X86-SSE2: # %bb.0:
2025 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
2026 ; X86-SSE2-NEXT: psllq $50, %xmm1
2027 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
2028 ; X86-SSE2-NEXT: psrlq $14, %xmm0
2029 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
2030 ; X86-SSE2-NEXT: orpd %xmm1, %xmm0
2031 ; X86-SSE2-NEXT: retl
2032 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
; Rotate right of all four i32 lanes by the same constant 4 (fshr with %x as
; both data operands and a splat amount). The SSE and AVX expectations OR an
; immediate right shift by 4 with an immediate left shift by 28; the AVX512
; runs use vprord $4 and the XOP runs use vprotd $28.
2036 define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x) nounwind {
2037 ; SSE-LABEL: splatconstant_funnnel_v4i32:
2039 ; SSE-NEXT: movdqa %xmm0, %xmm1
2040 ; SSE-NEXT: psrld $4, %xmm1
2041 ; SSE-NEXT: pslld $28, %xmm0
2042 ; SSE-NEXT: por %xmm1, %xmm0
2045 ; AVX-LABEL: splatconstant_funnnel_v4i32:
2047 ; AVX-NEXT: vpsrld $4, %xmm0, %xmm1
2048 ; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2049 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2052 ; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2054 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2055 ; AVX512F-NEXT: vprord $4, %zmm0, %zmm0
2056 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2057 ; AVX512F-NEXT: vzeroupper
2058 ; AVX512F-NEXT: retq
2060 ; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2061 ; AVX512VL: # %bb.0:
2062 ; AVX512VL-NEXT: vprord $4, %xmm0, %xmm0
2063 ; AVX512VL-NEXT: retq
2065 ; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2066 ; AVX512BW: # %bb.0:
2067 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2068 ; AVX512BW-NEXT: vprord $4, %zmm0, %zmm0
2069 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2070 ; AVX512BW-NEXT: vzeroupper
2071 ; AVX512BW-NEXT: retq
2073 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2074 ; AVX512VLBW: # %bb.0:
2075 ; AVX512VLBW-NEXT: vprord $4, %xmm0, %xmm0
2076 ; AVX512VLBW-NEXT: retq
2078 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2079 ; AVX512VBMI2: # %bb.0:
2080 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2081 ; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0
2082 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2083 ; AVX512VBMI2-NEXT: vzeroupper
2084 ; AVX512VBMI2-NEXT: retq
2086 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2087 ; AVX512VLVBMI2: # %bb.0:
2088 ; AVX512VLVBMI2-NEXT: vprord $4, %xmm0, %xmm0
2089 ; AVX512VLVBMI2-NEXT: retq
2091 ; XOP-LABEL: splatconstant_funnnel_v4i32:
2093 ; XOP-NEXT: vprotd $28, %xmm0, %xmm0
2096 ; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
2097 ; X86-SSE2: # %bb.0:
2098 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
2099 ; X86-SSE2-NEXT: psrld $4, %xmm1
2100 ; X86-SSE2-NEXT: pslld $28, %xmm0
2101 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2102 ; X86-SSE2-NEXT: retl
2103 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
; Splat-constant rotate test for <8 x i16>: calls llvm.fshr.v8i16 with %x as
; both data operands and a shift amount of 7 in every lane, i.e. each i16 lane
; is rotated right by 7 bits. The expected code is a psrlw $7 / psllw $9 / por
; sequence on most targets, a single vpshrdw $7 on the AVX512VBMI2 runs, and a
; left rotate by 9 (vprotw $9) on XOP.
; The check lines below are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than hand-editing.
2107 define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x) nounwind {
2108 ; SSE-LABEL: splatconstant_funnnel_v8i16:
2110 ; SSE-NEXT: movdqa %xmm0, %xmm1
2111 ; SSE-NEXT: psrlw $7, %xmm1
2112 ; SSE-NEXT: psllw $9, %xmm0
2113 ; SSE-NEXT: por %xmm1, %xmm0
2116 ; AVX-LABEL: splatconstant_funnnel_v8i16:
2118 ; AVX-NEXT: vpsrlw $7, %xmm0, %xmm1
2119 ; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2120 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2123 ; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2125 ; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1
2126 ; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
2127 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2128 ; AVX512F-NEXT: retq
2130 ; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2131 ; AVX512VL: # %bb.0:
2132 ; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm1
2133 ; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
2134 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2135 ; AVX512VL-NEXT: retq
2137 ; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2138 ; AVX512BW: # %bb.0:
2139 ; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1
2140 ; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
2141 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2142 ; AVX512BW-NEXT: retq
2144 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2145 ; AVX512VLBW: # %bb.0:
2146 ; AVX512VLBW-NEXT: vpsrlw $7, %xmm0, %xmm1
2147 ; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
2148 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2149 ; AVX512VLBW-NEXT: retq
2151 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2152 ; AVX512VBMI2: # %bb.0:
2153 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2154 ; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm0, %zmm0
2155 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2156 ; AVX512VBMI2-NEXT: vzeroupper
2157 ; AVX512VBMI2-NEXT: retq
2159 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2160 ; AVX512VLVBMI2: # %bb.0:
2161 ; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm0, %xmm0
2162 ; AVX512VLVBMI2-NEXT: retq
2164 ; XOP-LABEL: splatconstant_funnnel_v8i16:
2166 ; XOP-NEXT: vprotw $9, %xmm0, %xmm0
2169 ; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
2170 ; X86-SSE2: # %bb.0:
2171 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
2172 ; X86-SSE2-NEXT: psrlw $7, %xmm1
2173 ; X86-SSE2-NEXT: psllw $9, %xmm0
2174 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2175 ; X86-SSE2-NEXT: retl
2176 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %x, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
2180 define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
2181 ; SSE-LABEL: splatconstant_funnnel_v16i8:
2183 ; SSE-NEXT: movdqa %xmm0, %xmm1
2184 ; SSE-NEXT: psrlw $4, %xmm1
2185 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2186 ; SSE-NEXT: psllw $4, %xmm0
2187 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2188 ; SSE-NEXT: por %xmm1, %xmm0
2191 ; AVX-LABEL: splatconstant_funnnel_v16i8:
2193 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
2194 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2195 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2196 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2197 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2200 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
2202 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1
2203 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2204 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
2205 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2206 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2207 ; AVX512F-NEXT: retq
2209 ; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
2210 ; AVX512VL: # %bb.0:
2211 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm1
2212 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm0
2213 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
2214 ; AVX512VL-NEXT: retq
2216 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
2217 ; AVX512BW: # %bb.0:
2218 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1
2219 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2220 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
2221 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2222 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2223 ; AVX512BW-NEXT: retq
2225 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
2226 ; AVX512VLBW: # %bb.0:
2227 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm1
2228 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm0
2229 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
2230 ; AVX512VLBW-NEXT: retq
2232 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
2233 ; AVX512VBMI2: # %bb.0:
2234 ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
2235 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2236 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
2237 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2238 ; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2239 ; AVX512VBMI2-NEXT: retq
2241 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
2242 ; AVX512VLVBMI2: # %bb.0:
2243 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm1
2244 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm0
2245 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
2246 ; AVX512VLVBMI2-NEXT: retq
2248 ; XOP-LABEL: splatconstant_funnnel_v16i8:
2250 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0
2253 ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
2254 ; X86-SSE2: # %bb.0:
2255 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
2256 ; X86-SSE2-NEXT: psrlw $4, %xmm1
2257 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2258 ; X86-SSE2-NEXT: psllw $4, %xmm0
2259 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2260 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2261 ; X86-SSE2-NEXT: retl
2262 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %x, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)