1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
15 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
18 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
20 declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
21 declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
; Variable-amount funnel shift right on <2 x i64>: %x, %y and %amt are
; forwarded unchanged to @llvm.fshr.v2i64.
; The assertions inside this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; In the checks below, the two runs that enable +avx512vbmi2 use a single
; vpshrdvq, while every other run ends by OR-ing two separately shifted values.
27 define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
28 ; SSE2-LABEL: var_funnnel_v2i64:
30 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
31 ; SSE2-NEXT: movdqa %xmm2, %xmm4
32 ; SSE2-NEXT: pand %xmm3, %xmm4
33 ; SSE2-NEXT: movdqa %xmm1, %xmm5
34 ; SSE2-NEXT: psrlq %xmm4, %xmm5
35 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
36 ; SSE2-NEXT: psrlq %xmm4, %xmm1
37 ; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
38 ; SSE2-NEXT: pandn %xmm3, %xmm2
39 ; SSE2-NEXT: paddq %xmm0, %xmm0
40 ; SSE2-NEXT: movdqa %xmm0, %xmm1
41 ; SSE2-NEXT: psllq %xmm2, %xmm1
42 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
43 ; SSE2-NEXT: psllq %xmm2, %xmm0
44 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
45 ; SSE2-NEXT: orpd %xmm5, %xmm0
48 ; SSE41-LABEL: var_funnnel_v2i64:
50 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
51 ; SSE41-NEXT: movdqa %xmm2, %xmm4
52 ; SSE41-NEXT: pand %xmm3, %xmm4
53 ; SSE41-NEXT: movdqa %xmm1, %xmm5
54 ; SSE41-NEXT: psrlq %xmm4, %xmm5
55 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
56 ; SSE41-NEXT: psrlq %xmm4, %xmm1
57 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
58 ; SSE41-NEXT: pandn %xmm3, %xmm2
59 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
60 ; SSE41-NEXT: paddq %xmm0, %xmm0
61 ; SSE41-NEXT: movdqa %xmm0, %xmm3
62 ; SSE41-NEXT: psllq %xmm1, %xmm3
63 ; SSE41-NEXT: psllq %xmm2, %xmm0
64 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
65 ; SSE41-NEXT: por %xmm5, %xmm0
68 ; AVX1-LABEL: var_funnnel_v2i64:
70 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
71 ; AVX1-NEXT: # xmm3 = mem[0,0]
72 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
73 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
74 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
75 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
76 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
77 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
78 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
79 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
80 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
81 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
82 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
83 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
86 ; AVX2-LABEL: var_funnnel_v2i64:
88 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
89 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
90 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
91 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
92 ; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
93 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
94 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
97 ; AVX512F-LABEL: var_funnnel_v2i64:
99 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
100 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
101 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
102 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
103 ; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
104 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
105 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
108 ; AVX512VL-LABEL: var_funnnel_v2i64:
110 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
111 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
112 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
113 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
114 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
115 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
116 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
117 ; AVX512VL-NEXT: retq
119 ; AVX512BW-LABEL: var_funnnel_v2i64:
121 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
122 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
123 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
124 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
125 ; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
126 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
127 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
128 ; AVX512BW-NEXT: retq
130 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
131 ; AVX512VBMI2: # %bb.0:
132 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
133 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
134 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
135 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
136 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
137 ; AVX512VBMI2-NEXT: vzeroupper
138 ; AVX512VBMI2-NEXT: retq
140 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
141 ; AVX512VLBW: # %bb.0:
142 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
143 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
144 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
145 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
146 ; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
147 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
148 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
149 ; AVX512VLBW-NEXT: retq
151 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
152 ; AVX512VLVBMI2: # %bb.0:
153 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
154 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
155 ; AVX512VLVBMI2-NEXT: retq
157 ; XOPAVX1-LABEL: var_funnnel_v2i64:
159 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
160 ; XOPAVX1-NEXT: # xmm3 = mem[0,0]
161 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
162 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
163 ; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
164 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
165 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
166 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
167 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
168 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
171 ; XOPAVX2-LABEL: var_funnnel_v2i64:
173 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
174 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
175 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
176 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
177 ; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
178 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
179 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
182 ; X86-SSE2-LABEL: var_funnnel_v2i64:
184 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
185 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
186 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
187 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
188 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm3
189 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
190 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
191 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
192 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
193 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0
194 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
195 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1
196 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
197 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
198 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
199 ; X86-SSE2-NEXT: orpd %xmm3, %xmm0
200 ; X86-SSE2-NEXT: retl
201 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; Variable-amount funnel shift right on <4 x i32>: %x, %y and %amt are
; forwarded unchanged to @llvm.fshr.v4i32.
; The assertions inside this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; In the checks below, the two runs that enable +avx512vbmi2 use a single
; vpshrdvd, while every other run ends by OR-ing two separately computed
; partial results.
205 define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
206 ; SSE2-LABEL: var_funnnel_v4i32:
208 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
209 ; SSE2-NEXT: movdqa %xmm2, %xmm5
210 ; SSE2-NEXT: pand %xmm4, %xmm5
211 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
212 ; SSE2-NEXT: movdqa %xmm1, %xmm6
213 ; SSE2-NEXT: psrld %xmm3, %xmm6
214 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
215 ; SSE2-NEXT: movdqa %xmm1, %xmm3
216 ; SSE2-NEXT: psrld %xmm7, %xmm3
217 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
218 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
219 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
220 ; SSE2-NEXT: movdqa %xmm1, %xmm7
221 ; SSE2-NEXT: psrld %xmm6, %xmm7
222 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
223 ; SSE2-NEXT: psrld %xmm5, %xmm1
224 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
225 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
226 ; SSE2-NEXT: pandn %xmm4, %xmm2
227 ; SSE2-NEXT: pslld $23, %xmm2
228 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
229 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
230 ; SSE2-NEXT: paddd %xmm0, %xmm0
231 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
232 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
233 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
234 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
235 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
236 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
237 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
238 ; SSE2-NEXT: por %xmm3, %xmm0
241 ; SSE41-LABEL: var_funnnel_v4i32:
243 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
244 ; SSE41-NEXT: movdqa %xmm2, %xmm4
245 ; SSE41-NEXT: pand %xmm3, %xmm4
246 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
247 ; SSE41-NEXT: movdqa %xmm1, %xmm6
248 ; SSE41-NEXT: psrld %xmm5, %xmm6
249 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
250 ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
251 ; SSE41-NEXT: movdqa %xmm1, %xmm8
252 ; SSE41-NEXT: psrld %xmm7, %xmm8
253 ; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
254 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
255 ; SSE41-NEXT: movdqa %xmm1, %xmm6
256 ; SSE41-NEXT: psrld %xmm4, %xmm6
257 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
258 ; SSE41-NEXT: psrld %xmm4, %xmm1
259 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
260 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
261 ; SSE41-NEXT: pandn %xmm3, %xmm2
262 ; SSE41-NEXT: pslld $23, %xmm2
263 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
264 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
265 ; SSE41-NEXT: paddd %xmm0, %xmm0
266 ; SSE41-NEXT: pmulld %xmm1, %xmm0
267 ; SSE41-NEXT: por %xmm6, %xmm0
270 ; AVX1-LABEL: var_funnnel_v4i32:
272 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
273 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
274 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
275 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
276 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
277 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
278 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
279 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
280 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
281 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
282 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
283 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
284 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
285 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
286 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
287 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
288 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
289 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
290 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
291 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
292 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
295 ; AVX2-LABEL: var_funnnel_v4i32:
297 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
298 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
299 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
300 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
301 ; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
302 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
303 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
306 ; AVX512F-LABEL: var_funnnel_v4i32:
308 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
309 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
310 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
311 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
312 ; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0
313 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
314 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
317 ; AVX512VL-LABEL: var_funnnel_v4i32:
319 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
320 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
321 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
322 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
323 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0
324 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
325 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
326 ; AVX512VL-NEXT: retq
328 ; AVX512BW-LABEL: var_funnnel_v4i32:
330 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
331 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
332 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
333 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
334 ; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
335 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
336 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
337 ; AVX512BW-NEXT: retq
339 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
340 ; AVX512VBMI2: # %bb.0:
341 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
342 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
343 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
344 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
345 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
346 ; AVX512VBMI2-NEXT: vzeroupper
347 ; AVX512VBMI2-NEXT: retq
349 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
350 ; AVX512VLBW: # %bb.0:
351 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
352 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
353 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
354 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
355 ; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
356 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
357 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
358 ; AVX512VLBW-NEXT: retq
360 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
361 ; AVX512VLVBMI2: # %bb.0:
362 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
363 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
364 ; AVX512VLVBMI2-NEXT: retq
366 ; XOPAVX1-LABEL: var_funnnel_v4i32:
368 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
369 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
370 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
371 ; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
372 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
373 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
374 ; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
375 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
376 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
379 ; XOPAVX2-LABEL: var_funnnel_v4i32:
381 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
382 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
383 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
384 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
385 ; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
386 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
387 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
390 ; X86-SSE2-LABEL: var_funnnel_v4i32:
392 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
393 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
394 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
395 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
396 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
397 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6
398 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
399 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
400 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3
401 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
402 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
403 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
404 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
405 ; X86-SSE2-NEXT: psrld %xmm6, %xmm7
406 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
407 ; X86-SSE2-NEXT: psrld %xmm5, %xmm1
408 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
409 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
410 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
411 ; X86-SSE2-NEXT: pslld $23, %xmm2
412 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
413 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
414 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0
415 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
416 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
417 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
418 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
419 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
420 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
421 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
422 ; X86-SSE2-NEXT: por %xmm3, %xmm0
423 ; X86-SSE2-NEXT: retl
424 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; Variable-amount funnel shift right on <8 x i16>: %x, %y and %amt are
; forwarded unchanged to @llvm.fshr.v8i16.
; The assertions inside this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; In the checks below, the two runs that enable +avx512vbmi2 use a single
; vpshrdvw. The plain AVX2, AVX512F and AVX512VL runs zero-extend the inputs to
; 32-bit elements (vpmovzxwd), blend them and use one vpsrlvd. The remaining
; runs end by OR-ing two separately computed partial results.
428 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
429 ; SSE2-LABEL: var_funnnel_v8i16:
431 ; SSE2-NEXT: movdqa %xmm2, %xmm4
432 ; SSE2-NEXT: psllw $12, %xmm4
433 ; SSE2-NEXT: movdqa %xmm4, %xmm3
434 ; SSE2-NEXT: psraw $15, %xmm3
435 ; SSE2-NEXT: movdqa %xmm3, %xmm5
436 ; SSE2-NEXT: pandn %xmm1, %xmm5
437 ; SSE2-NEXT: psrlw $8, %xmm1
438 ; SSE2-NEXT: pand %xmm1, %xmm3
439 ; SSE2-NEXT: por %xmm5, %xmm3
440 ; SSE2-NEXT: paddw %xmm4, %xmm4
441 ; SSE2-NEXT: movdqa %xmm4, %xmm1
442 ; SSE2-NEXT: psraw $15, %xmm1
443 ; SSE2-NEXT: movdqa %xmm1, %xmm5
444 ; SSE2-NEXT: pandn %xmm3, %xmm5
445 ; SSE2-NEXT: psrlw $4, %xmm3
446 ; SSE2-NEXT: pand %xmm1, %xmm3
447 ; SSE2-NEXT: por %xmm5, %xmm3
448 ; SSE2-NEXT: paddw %xmm4, %xmm4
449 ; SSE2-NEXT: movdqa %xmm4, %xmm1
450 ; SSE2-NEXT: psraw $15, %xmm1
451 ; SSE2-NEXT: movdqa %xmm1, %xmm5
452 ; SSE2-NEXT: pandn %xmm3, %xmm5
453 ; SSE2-NEXT: psrlw $2, %xmm3
454 ; SSE2-NEXT: pand %xmm1, %xmm3
455 ; SSE2-NEXT: por %xmm5, %xmm3
456 ; SSE2-NEXT: paddw %xmm4, %xmm4
457 ; SSE2-NEXT: psraw $15, %xmm4
458 ; SSE2-NEXT: movdqa %xmm4, %xmm1
459 ; SSE2-NEXT: pandn %xmm3, %xmm1
460 ; SSE2-NEXT: psrlw $1, %xmm3
461 ; SSE2-NEXT: pand %xmm4, %xmm3
462 ; SSE2-NEXT: por %xmm1, %xmm3
463 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
464 ; SSE2-NEXT: movdqa %xmm2, %xmm1
465 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
466 ; SSE2-NEXT: pslld $23, %xmm1
467 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
468 ; SSE2-NEXT: paddd %xmm4, %xmm1
469 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
470 ; SSE2-NEXT: pslld $16, %xmm1
471 ; SSE2-NEXT: psrad $16, %xmm1
472 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
473 ; SSE2-NEXT: pslld $23, %xmm2
474 ; SSE2-NEXT: paddd %xmm4, %xmm2
475 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
476 ; SSE2-NEXT: pslld $16, %xmm2
477 ; SSE2-NEXT: psrad $16, %xmm2
478 ; SSE2-NEXT: packssdw %xmm1, %xmm2
479 ; SSE2-NEXT: paddw %xmm0, %xmm0
480 ; SSE2-NEXT: pmullw %xmm2, %xmm0
481 ; SSE2-NEXT: por %xmm3, %xmm0
484 ; SSE41-LABEL: var_funnnel_v8i16:
486 ; SSE41-NEXT: movdqa %xmm0, %xmm3
487 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
488 ; SSE41-NEXT: movdqa %xmm2, %xmm0
489 ; SSE41-NEXT: pand %xmm5, %xmm0
490 ; SSE41-NEXT: movdqa %xmm0, %xmm4
491 ; SSE41-NEXT: psllw $12, %xmm4
492 ; SSE41-NEXT: psllw $4, %xmm0
493 ; SSE41-NEXT: por %xmm4, %xmm0
494 ; SSE41-NEXT: movdqa %xmm0, %xmm4
495 ; SSE41-NEXT: paddw %xmm0, %xmm4
496 ; SSE41-NEXT: movdqa %xmm1, %xmm6
497 ; SSE41-NEXT: psrlw $8, %xmm6
498 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
499 ; SSE41-NEXT: movdqa %xmm1, %xmm6
500 ; SSE41-NEXT: psrlw $4, %xmm6
501 ; SSE41-NEXT: movdqa %xmm4, %xmm0
502 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
503 ; SSE41-NEXT: movdqa %xmm1, %xmm6
504 ; SSE41-NEXT: psrlw $2, %xmm6
505 ; SSE41-NEXT: paddw %xmm4, %xmm4
506 ; SSE41-NEXT: movdqa %xmm4, %xmm0
507 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
508 ; SSE41-NEXT: movdqa %xmm1, %xmm6
509 ; SSE41-NEXT: psrlw $1, %xmm6
510 ; SSE41-NEXT: paddw %xmm4, %xmm4
511 ; SSE41-NEXT: movdqa %xmm4, %xmm0
512 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
513 ; SSE41-NEXT: pandn %xmm5, %xmm2
514 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
515 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
516 ; SSE41-NEXT: pslld $23, %xmm2
517 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
518 ; SSE41-NEXT: paddd %xmm4, %xmm2
519 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
520 ; SSE41-NEXT: pslld $23, %xmm0
521 ; SSE41-NEXT: paddd %xmm4, %xmm0
522 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
523 ; SSE41-NEXT: packusdw %xmm2, %xmm0
524 ; SSE41-NEXT: paddw %xmm3, %xmm3
525 ; SSE41-NEXT: pmullw %xmm0, %xmm3
526 ; SSE41-NEXT: por %xmm1, %xmm3
527 ; SSE41-NEXT: movdqa %xmm3, %xmm0
530 ; AVX1-LABEL: var_funnnel_v8i16:
532 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
533 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
534 ; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
535 ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
536 ; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
537 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
538 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6
539 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
540 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
541 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
542 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
543 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
544 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
545 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
546 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
547 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
548 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
549 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
550 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
551 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
552 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
553 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
554 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
555 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
556 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
557 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
558 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
559 ; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
560 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
561 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
564 ; AVX2-LABEL: var_funnnel_v8i16:
566 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
567 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
568 ; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
569 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
570 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
571 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
572 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
573 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
574 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
575 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
576 ; AVX2-NEXT: vzeroupper
579 ; AVX512F-LABEL: var_funnnel_v8i16:
581 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
582 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
583 ; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0
584 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
585 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
586 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
587 ; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
588 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
589 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
590 ; AVX512F-NEXT: vzeroupper
593 ; AVX512VL-LABEL: var_funnnel_v8i16:
595 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
596 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
597 ; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0
598 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
599 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
600 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
601 ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
602 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
603 ; AVX512VL-NEXT: vzeroupper
604 ; AVX512VL-NEXT: retq
606 ; AVX512BW-LABEL: var_funnnel_v8i16:
608 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
609 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
610 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
611 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
612 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
613 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
614 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
615 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
616 ; AVX512BW-NEXT: vzeroupper
617 ; AVX512BW-NEXT: retq
619 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
620 ; AVX512VBMI2: # %bb.0:
621 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
622 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
623 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
624 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
625 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
626 ; AVX512VBMI2-NEXT: vzeroupper
627 ; AVX512VBMI2-NEXT: retq
629 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
630 ; AVX512VLBW: # %bb.0:
631 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
632 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
633 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
634 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
635 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
636 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
637 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
638 ; AVX512VLBW-NEXT: retq
640 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
641 ; AVX512VLVBMI2: # %bb.0:
642 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
643 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
644 ; AVX512VLVBMI2-NEXT: retq
646 ; XOPAVX1-LABEL: var_funnnel_v8i16:
648 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
649 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
650 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
651 ; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4
652 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1
653 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
654 ; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
655 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
656 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
659 ; XOPAVX2-LABEL: var_funnnel_v8i16:
661 ; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
662 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
663 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
664 ; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4
665 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
666 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
667 ; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0
668 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
669 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
672 ; X86-SSE2-LABEL: var_funnnel_v8i16:
674 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
675 ; X86-SSE2-NEXT: psllw $12, %xmm4
676 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
677 ; X86-SSE2-NEXT: psraw $15, %xmm3
678 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
679 ; X86-SSE2-NEXT: pandn %xmm1, %xmm5
680 ; X86-SSE2-NEXT: psrlw $8, %xmm1
681 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
682 ; X86-SSE2-NEXT: por %xmm5, %xmm3
683 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
684 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
685 ; X86-SSE2-NEXT: psraw $15, %xmm1
686 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
687 ; X86-SSE2-NEXT: pandn %xmm3, %xmm5
688 ; X86-SSE2-NEXT: psrlw $4, %xmm3
689 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
690 ; X86-SSE2-NEXT: por %xmm5, %xmm3
691 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
692 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
693 ; X86-SSE2-NEXT: psraw $15, %xmm1
694 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
695 ; X86-SSE2-NEXT: pandn %xmm3, %xmm5
696 ; X86-SSE2-NEXT: psrlw $2, %xmm3
697 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
698 ; X86-SSE2-NEXT: por %xmm5, %xmm3
699 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
700 ; X86-SSE2-NEXT: psraw $15, %xmm4
701 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
702 ; X86-SSE2-NEXT: pandn %xmm3, %xmm1
703 ; X86-SSE2-NEXT: psrlw $1, %xmm3
704 ; X86-SSE2-NEXT: pand %xmm4, %xmm3
705 ; X86-SSE2-NEXT: por %xmm1, %xmm3
706 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
707 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
708 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
709 ; X86-SSE2-NEXT: pslld $23, %xmm1
710 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
711 ; X86-SSE2-NEXT: paddd %xmm4, %xmm1
712 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
713 ; X86-SSE2-NEXT: pslld $16, %xmm1
714 ; X86-SSE2-NEXT: psrad $16, %xmm1
715 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
716 ; X86-SSE2-NEXT: pslld $23, %xmm2
717 ; X86-SSE2-NEXT: paddd %xmm4, %xmm2
718 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
719 ; X86-SSE2-NEXT: pslld $16, %xmm2
720 ; X86-SSE2-NEXT: psrad $16, %xmm2
721 ; X86-SSE2-NEXT: packssdw %xmm1, %xmm2
722 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
723 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
724 ; X86-SSE2-NEXT: por %xmm3, %xmm0
725 ; X86-SSE2-NEXT: retl
726 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; var_funnnel_v16i8 - funnel-shift-right on 16 x i8 where every lane has its
; own shift amount: %amt is passed unmodified to @llvm.fshr.v16i8.
; The expected assembly below is autogenerated by
; utils/update_llc_test_checks.py; regenerate it instead of editing by hand.
; Lowerings visible in the expectations: the plain SSE and AVX runs mask the
; amount with 7 and select per byte between shifts by 4, 2 and 1; the
; AVX-512 F and VL runs widen to 32-bit lanes and use vpsrlvd/vpsllvd; the
; BW and VBMI2 runs widen to 16-bit lanes and use vpsrlvw; the XOP runs use
; vpshlb.
730 define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
731 ; SSE2-LABEL: var_funnnel_v16i8:
733 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
734 ; SSE2-NEXT: movdqa %xmm2, %xmm6
735 ; SSE2-NEXT: pand %xmm5, %xmm6
736 ; SSE2-NEXT: psllw $5, %xmm6
737 ; SSE2-NEXT: pxor %xmm4, %xmm4
738 ; SSE2-NEXT: pxor %xmm3, %xmm3
739 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm3
740 ; SSE2-NEXT: movdqa %xmm3, %xmm7
741 ; SSE2-NEXT: pandn %xmm1, %xmm7
742 ; SSE2-NEXT: psrlw $4, %xmm1
743 ; SSE2-NEXT: pand %xmm1, %xmm3
744 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
745 ; SSE2-NEXT: por %xmm7, %xmm3
746 ; SSE2-NEXT: paddb %xmm6, %xmm6
747 ; SSE2-NEXT: pxor %xmm1, %xmm1
748 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
749 ; SSE2-NEXT: movdqa %xmm1, %xmm7
750 ; SSE2-NEXT: pandn %xmm3, %xmm7
751 ; SSE2-NEXT: psrlw $2, %xmm3
752 ; SSE2-NEXT: pand %xmm1, %xmm3
753 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
754 ; SSE2-NEXT: por %xmm7, %xmm3
755 ; SSE2-NEXT: paddb %xmm6, %xmm6
756 ; SSE2-NEXT: pxor %xmm1, %xmm1
757 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
758 ; SSE2-NEXT: movdqa %xmm1, %xmm6
759 ; SSE2-NEXT: pandn %xmm3, %xmm6
760 ; SSE2-NEXT: psrlw $1, %xmm3
761 ; SSE2-NEXT: pand %xmm1, %xmm3
762 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
763 ; SSE2-NEXT: por %xmm6, %xmm3
764 ; SSE2-NEXT: pandn %xmm5, %xmm2
765 ; SSE2-NEXT: psllw $5, %xmm2
766 ; SSE2-NEXT: pxor %xmm1, %xmm1
767 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
768 ; SSE2-NEXT: paddb %xmm0, %xmm0
769 ; SSE2-NEXT: movdqa %xmm1, %xmm5
770 ; SSE2-NEXT: pandn %xmm0, %xmm5
771 ; SSE2-NEXT: psllw $4, %xmm0
772 ; SSE2-NEXT: pand %xmm1, %xmm0
773 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
774 ; SSE2-NEXT: por %xmm5, %xmm0
775 ; SSE2-NEXT: paddb %xmm2, %xmm2
776 ; SSE2-NEXT: pxor %xmm1, %xmm1
777 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
778 ; SSE2-NEXT: movdqa %xmm1, %xmm5
779 ; SSE2-NEXT: pandn %xmm0, %xmm5
780 ; SSE2-NEXT: psllw $2, %xmm0
781 ; SSE2-NEXT: pand %xmm1, %xmm0
782 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
783 ; SSE2-NEXT: por %xmm5, %xmm0
784 ; SSE2-NEXT: paddb %xmm2, %xmm2
785 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
786 ; SSE2-NEXT: movdqa %xmm4, %xmm1
787 ; SSE2-NEXT: pandn %xmm0, %xmm1
788 ; SSE2-NEXT: paddb %xmm0, %xmm0
789 ; SSE2-NEXT: pand %xmm4, %xmm0
790 ; SSE2-NEXT: por %xmm1, %xmm0
791 ; SSE2-NEXT: por %xmm3, %xmm0
794 ; SSE41-LABEL: var_funnnel_v16i8:
796 ; SSE41-NEXT: movdqa %xmm2, %xmm3
797 ; SSE41-NEXT: movdqa %xmm0, %xmm2
798 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
799 ; SSE41-NEXT: movdqa %xmm3, %xmm0
800 ; SSE41-NEXT: pand %xmm5, %xmm0
801 ; SSE41-NEXT: psllw $5, %xmm0
802 ; SSE41-NEXT: movdqa %xmm0, %xmm4
803 ; SSE41-NEXT: paddb %xmm0, %xmm4
804 ; SSE41-NEXT: movdqa %xmm1, %xmm6
805 ; SSE41-NEXT: psrlw $4, %xmm6
806 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
807 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
808 ; SSE41-NEXT: movdqa %xmm1, %xmm6
809 ; SSE41-NEXT: psrlw $2, %xmm6
810 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
811 ; SSE41-NEXT: movdqa %xmm4, %xmm0
812 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
813 ; SSE41-NEXT: movdqa %xmm1, %xmm6
814 ; SSE41-NEXT: psrlw $1, %xmm6
815 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
816 ; SSE41-NEXT: paddb %xmm4, %xmm4
817 ; SSE41-NEXT: movdqa %xmm4, %xmm0
818 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
819 ; SSE41-NEXT: pandn %xmm5, %xmm3
820 ; SSE41-NEXT: psllw $5, %xmm3
821 ; SSE41-NEXT: movdqa %xmm3, %xmm4
822 ; SSE41-NEXT: paddb %xmm3, %xmm4
823 ; SSE41-NEXT: paddb %xmm2, %xmm2
824 ; SSE41-NEXT: movdqa %xmm2, %xmm5
825 ; SSE41-NEXT: psllw $4, %xmm5
826 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
827 ; SSE41-NEXT: movdqa %xmm3, %xmm0
828 ; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
829 ; SSE41-NEXT: movdqa %xmm2, %xmm3
830 ; SSE41-NEXT: psllw $2, %xmm3
831 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
832 ; SSE41-NEXT: movdqa %xmm4, %xmm0
833 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
834 ; SSE41-NEXT: movdqa %xmm2, %xmm3
835 ; SSE41-NEXT: paddb %xmm2, %xmm3
836 ; SSE41-NEXT: paddb %xmm4, %xmm4
837 ; SSE41-NEXT: movdqa %xmm4, %xmm0
838 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
839 ; SSE41-NEXT: por %xmm1, %xmm2
840 ; SSE41-NEXT: movdqa %xmm2, %xmm0
843 ; AVX1-LABEL: var_funnnel_v16i8:
845 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
846 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
847 ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
848 ; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
849 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
850 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
851 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
852 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
853 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
854 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
855 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
856 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
857 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
858 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
859 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
860 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
861 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
862 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
863 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
864 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
865 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
866 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
867 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
868 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
869 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
870 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
871 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
872 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
875 ; AVX2-LABEL: var_funnnel_v16i8:
877 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
878 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
879 ; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4
880 ; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
881 ; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
882 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
883 ; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
884 ; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4
885 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
886 ; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
887 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm4
888 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
889 ; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
890 ; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
891 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
892 ; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2
893 ; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
894 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
895 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4
896 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
897 ; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
898 ; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
899 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
900 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
901 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
902 ; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
903 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
904 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
907 ; AVX512F-LABEL: var_funnnel_v16i8:
909 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
910 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
911 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
912 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
913 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
914 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
915 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
916 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
917 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
918 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
919 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
920 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
921 ; AVX512F-NEXT: vzeroupper
924 ; AVX512VL-LABEL: var_funnnel_v16i8:
926 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
927 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
928 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
929 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
930 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
931 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
932 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
933 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
934 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
935 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
936 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
937 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
938 ; AVX512VL-NEXT: vzeroupper
939 ; AVX512VL-NEXT: retq
941 ; AVX512BW-LABEL: var_funnnel_v16i8:
943 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
944 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
945 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
946 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
947 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
948 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
949 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
950 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
951 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
952 ; AVX512BW-NEXT: vzeroupper
953 ; AVX512BW-NEXT: retq
955 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
956 ; AVX512VBMI2: # %bb.0:
957 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
958 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
959 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
960 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1
961 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
962 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
963 ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0
964 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
965 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
966 ; AVX512VBMI2-NEXT: vzeroupper
967 ; AVX512VBMI2-NEXT: retq
969 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
970 ; AVX512VLBW: # %bb.0:
971 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
972 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
973 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
974 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
975 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
976 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
977 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
978 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
979 ; AVX512VLBW-NEXT: vzeroupper
980 ; AVX512VLBW-NEXT: retq
982 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
983 ; AVX512VLVBMI2: # %bb.0:
984 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
985 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
986 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
987 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
988 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
989 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
990 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm0, %ymm3, %ymm0
991 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
992 ; AVX512VLVBMI2-NEXT: vzeroupper
993 ; AVX512VLVBMI2-NEXT: retq
995 ; XOPAVX1-LABEL: var_funnnel_v16i8:
997 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
998 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
999 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
1000 ; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
1001 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
1002 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1003 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1004 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1005 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1006 ; XOPAVX1-NEXT: retq
1008 ; XOPAVX2-LABEL: var_funnnel_v16i8:
1010 ; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1011 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1012 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
1013 ; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
1014 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
1015 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1016 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1017 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1018 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1019 ; XOPAVX2-NEXT: retq
1021 ; X86-SSE2-LABEL: var_funnnel_v16i8:
1022 ; X86-SSE2: # %bb.0:
1023 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1024 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
1025 ; X86-SSE2-NEXT: pand %xmm5, %xmm6
1026 ; X86-SSE2-NEXT: psllw $5, %xmm6
1027 ; X86-SSE2-NEXT: pxor %xmm4, %xmm4
1028 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
1029 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3
1030 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
1031 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7
1032 ; X86-SSE2-NEXT: psrlw $4, %xmm1
1033 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1034 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1035 ; X86-SSE2-NEXT: por %xmm7, %xmm3
1036 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6
1037 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1038 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
1039 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
1040 ; X86-SSE2-NEXT: pandn %xmm3, %xmm7
1041 ; X86-SSE2-NEXT: psrlw $2, %xmm3
1042 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1043 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1044 ; X86-SSE2-NEXT: por %xmm7, %xmm3
1045 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6
1046 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1047 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
1048 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
1049 ; X86-SSE2-NEXT: pandn %xmm3, %xmm6
1050 ; X86-SSE2-NEXT: psrlw $1, %xmm3
1051 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1052 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1053 ; X86-SSE2-NEXT: por %xmm6, %xmm3
1054 ; X86-SSE2-NEXT: pandn %xmm5, %xmm2
1055 ; X86-SSE2-NEXT: psllw $5, %xmm2
1056 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1057 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
1058 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1059 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1060 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5
1061 ; X86-SSE2-NEXT: psllw $4, %xmm0
1062 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
1063 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1064 ; X86-SSE2-NEXT: por %xmm5, %xmm0
1065 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1066 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1067 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
1068 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1069 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5
1070 ; X86-SSE2-NEXT: psllw $2, %xmm0
1071 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
1072 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1073 ; X86-SSE2-NEXT: por %xmm5, %xmm0
1074 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1075 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
1076 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
1077 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1
1078 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1079 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
1080 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1081 ; X86-SSE2-NEXT: por %xmm3, %xmm0
1082 ; X86-SSE2-NEXT: retl
1083 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
1088 ; Uniform Variable Shifts
; splatvar_funnnel_v2i64 - funnel-shift-right on 2 x i64 with a uniform
; amount: %splat broadcasts element 0 of %amt to both lanes (all-zero shuffle
; mask) and is then passed to @llvm.fshr.v2i64.
; The expected assembly below is autogenerated by
; utils/update_llc_test_checks.py; regenerate it instead of editing by hand.
; Most runs are expected to mask the amount with 63 and combine a psrlq with
; a psllq of the doubled first operand; the VBMI2 runs are expected to
; broadcast the amount and use vpshrdvq.
1091 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
1092 ; SSE-LABEL: splatvar_funnnel_v2i64:
1094 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
1095 ; SSE-NEXT: movdqa %xmm2, %xmm4
1096 ; SSE-NEXT: pand %xmm3, %xmm4
1097 ; SSE-NEXT: psrlq %xmm4, %xmm1
1098 ; SSE-NEXT: pandn %xmm3, %xmm2
1099 ; SSE-NEXT: paddq %xmm0, %xmm0
1100 ; SSE-NEXT: psllq %xmm2, %xmm0
1101 ; SSE-NEXT: por %xmm1, %xmm0
1104 ; AVX1-LABEL: splatvar_funnnel_v2i64:
1106 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
1107 ; AVX1-NEXT: # xmm3 = mem[0,0]
1108 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1109 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1110 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1111 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1112 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1113 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1116 ; AVX2-LABEL: splatvar_funnnel_v2i64:
1118 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1119 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1120 ; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1121 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1122 ; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1123 ; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1124 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1127 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
1129 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1130 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1131 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1132 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1133 ; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1134 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1135 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1136 ; AVX512F-NEXT: retq
1138 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1139 ; AVX512VL: # %bb.0:
1140 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1141 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1142 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1143 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1144 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1145 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1146 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1147 ; AVX512VL-NEXT: retq
1149 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1150 ; AVX512BW: # %bb.0:
1151 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1152 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1153 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1154 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1155 ; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1156 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1157 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1158 ; AVX512BW-NEXT: retq
1160 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1161 ; AVX512VBMI2: # %bb.0:
1162 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1163 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1164 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1165 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1166 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1167 ; AVX512VBMI2-NEXT: vzeroupper
1168 ; AVX512VBMI2-NEXT: retq
1170 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1171 ; AVX512VLBW: # %bb.0:
1172 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1173 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1174 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1175 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1176 ; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1177 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1178 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1179 ; AVX512VLBW-NEXT: retq
1181 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1182 ; AVX512VLVBMI2: # %bb.0:
1183 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1184 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
1185 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1186 ; AVX512VLVBMI2-NEXT: retq
1188 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
1190 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
1191 ; XOPAVX1-NEXT: # xmm3 = mem[0,0]
1192 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1193 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1194 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1195 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1196 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1197 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1198 ; XOPAVX1-NEXT: retq
1200 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
1202 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1203 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1204 ; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1205 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1206 ; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1207 ; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1208 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1209 ; XOPAVX2-NEXT: retq
1211 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
1212 ; X86-SSE2: # %bb.0:
1213 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
1214 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1215 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1216 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
1217 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1218 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0
1219 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
1220 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1221 ; X86-SSE2-NEXT: retl
1222 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1223 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
; splatvar_funnnel_v4i32 - funnel-shift-right on 4 x i32 with a uniform
; amount: %splat broadcasts element 0 of %amt to all four lanes (all-zero
; shuffle mask) and is then passed to @llvm.fshr.v4i32.
; The expected assembly below is autogenerated by
; utils/update_llc_test_checks.py; regenerate it instead of editing by hand.
; Most runs are expected to interleave the two inputs with
; punpckldq/punpckhdq, shift the 64-bit pairs with psrlq and repack the low
; halves; the VBMI2 runs are expected to broadcast the amount and use
; vpshrdvd.
1227 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1228 ; SSE-LABEL: splatvar_funnnel_v4i32:
1230 ; SSE-NEXT: movdqa %xmm1, %xmm3
1231 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1232 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1233 ; SSE-NEXT: psrlq %xmm2, %xmm3
1234 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1235 ; SSE-NEXT: psrlq %xmm2, %xmm1
1236 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1237 ; SSE-NEXT: movaps %xmm1, %xmm0
1240 ; AVX-LABEL: splatvar_funnnel_v4i32:
1242 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1243 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1244 ; AVX-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1245 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1246 ; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1247 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1250 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
1252 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1253 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1254 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1255 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1256 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1257 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1258 ; AVX512F-NEXT: retq
1260 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1261 ; AVX512VL: # %bb.0:
1262 ; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1263 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1264 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1265 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1266 ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
1267 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
1268 ; AVX512VL-NEXT: vzeroupper
1269 ; AVX512VL-NEXT: retq
1271 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1272 ; AVX512BW: # %bb.0:
1273 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1274 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1275 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1276 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1277 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1278 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1279 ; AVX512BW-NEXT: retq
1281 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1282 ; AVX512VBMI2: # %bb.0:
1283 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1284 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1285 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1286 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1287 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1288 ; AVX512VBMI2-NEXT: vzeroupper
1289 ; AVX512VBMI2-NEXT: retq
1291 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1292 ; AVX512VLBW: # %bb.0:
1293 ; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1294 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1295 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1296 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1297 ; AVX512VLBW-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
1298 ; AVX512VLBW-NEXT: vpmovqd %ymm0, %xmm0
1299 ; AVX512VLBW-NEXT: vzeroupper
1300 ; AVX512VLBW-NEXT: retq
1302 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1303 ; AVX512VLVBMI2: # %bb.0:
1304 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1305 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
1306 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1307 ; AVX512VLVBMI2-NEXT: retq
1309 ; XOP-LABEL: splatvar_funnnel_v4i32:
1311 ; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1312 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1313 ; XOP-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1314 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1315 ; XOP-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1316 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1319 ; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1320 ; X86-SSE2: # %bb.0:
1321 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1322 ; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1323 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1324 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm3
1325 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1326 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm1
1327 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1328 ; X86-SSE2-NEXT: movaps %xmm1, %xmm0
1329 ; X86-SSE2-NEXT: retl
1330 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1331 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
; splatvar_funnnel_v8i16 - funnel-shift-right on 8 x i16 with a uniform
; amount: %splat broadcasts element 0 of %amt to all eight lanes (all-zero
; shuffle mask) and is then passed to @llvm.fshr.v8i16.
; The expected assembly below is autogenerated by
; utils/update_llc_test_checks.py; regenerate it instead of editing by hand.
; Most runs are expected to mask the amount with the constant [15,0,0,0] and
; combine a psrlw with a psllw of the doubled first operand; the VBMI2 runs
; are expected to broadcast the amount and use vpshrdvw.
1335 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1336 ; SSE-LABEL: splatvar_funnnel_v8i16:
1338 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
1339 ; SSE-NEXT: movdqa %xmm2, %xmm4
1340 ; SSE-NEXT: pand %xmm3, %xmm4
1341 ; SSE-NEXT: psrlw %xmm4, %xmm1
1342 ; SSE-NEXT: pandn %xmm3, %xmm2
1343 ; SSE-NEXT: paddw %xmm0, %xmm0
1344 ; SSE-NEXT: psllw %xmm2, %xmm0
1345 ; SSE-NEXT: por %xmm1, %xmm0
1348 ; AVX-LABEL: splatvar_funnnel_v8i16:
1350 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1351 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
1352 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1353 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
1354 ; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1355 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1356 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1359 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
1361 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1362 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1363 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1364 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1365 ; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1366 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1367 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1368 ; AVX512F-NEXT: retq
1370 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1371 ; AVX512VL: # %bb.0:
1372 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1373 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1374 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1375 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1376 ; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1377 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1378 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1379 ; AVX512VL-NEXT: retq
1381 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1382 ; AVX512BW: # %bb.0:
1383 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1384 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1385 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1386 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1387 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1388 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1389 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1390 ; AVX512BW-NEXT: retq
1392 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1393 ; AVX512VBMI2: # %bb.0:
1394 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1395 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1396 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1397 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
1398 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1399 ; AVX512VBMI2-NEXT: vzeroupper
1400 ; AVX512VBMI2-NEXT: retq
1402 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1403 ; AVX512VLBW: # %bb.0:
1404 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1405 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1406 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1407 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1408 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1409 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1410 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1411 ; AVX512VLBW-NEXT: retq
1413 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1414 ; AVX512VLVBMI2: # %bb.0:
1415 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1416 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
1417 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1418 ; AVX512VLVBMI2-NEXT: retq
1420 ; XOP-LABEL: splatvar_funnnel_v8i16:
1422 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1423 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
1424 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1425 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
1426 ; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1427 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1428 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1431 ; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1432 ; X86-SSE2: # %bb.0:
1433 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
1434 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1435 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1436 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
1437 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1438 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
1439 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0
1440 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1441 ; X86-SSE2-NEXT: retl
1442 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1443 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
; Funnel-shift-right (llvm.fshr) test on <16 x i8> where every lane uses the
; same variable shift amount, taken from element 0 of %amt.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
1447 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1448 ; SSE-LABEL: splatvar_funnnel_v16i8:
1450 ; SSE-NEXT: movdqa %xmm1, %xmm4
1451 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1452 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1453 ; SSE-NEXT: psrlw %xmm2, %xmm4
1454 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1455 ; SSE-NEXT: pand %xmm3, %xmm4
1456 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1457 ; SSE-NEXT: psrlw %xmm2, %xmm1
1458 ; SSE-NEXT: pand %xmm1, %xmm3
1459 ; SSE-NEXT: packuswb %xmm4, %xmm3
1460 ; SSE-NEXT: movdqa %xmm3, %xmm0
1463 ; AVX1-LABEL: splatvar_funnnel_v16i8:
1465 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1466 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1467 ; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1468 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1469 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1470 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1471 ; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1472 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1473 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1476 ; AVX2-LABEL: splatvar_funnnel_v16i8:
1478 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1479 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1480 ; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1481 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1482 ; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
1483 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1484 ; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1485 ; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
1486 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1489 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
1491 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1492 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1493 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1494 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1495 ; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3
1496 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1497 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1498 ; AVX512F-NEXT: vpand %xmm4, %xmm0, %xmm0
1499 ; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1500 ; AVX512F-NEXT: retq
1502 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1503 ; AVX512VL: # %bb.0:
1504 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1505 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1506 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1507 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1508 ; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3
1509 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1510 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1511 ; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0
1512 ; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1513 ; AVX512VL-NEXT: retq
1515 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1516 ; AVX512BW: # %bb.0:
1517 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1518 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1519 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1520 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1521 ; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3
1522 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1523 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1524 ; AVX512BW-NEXT: vpand %xmm4, %xmm0, %xmm0
1525 ; AVX512BW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1526 ; AVX512BW-NEXT: retq
1528 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1529 ; AVX512VBMI2: # %bb.0:
1530 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78]
1531 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1532 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1533 ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm4, %xmm4
1534 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1535 ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1536 ; AVX512VBMI2-NEXT: vpermt2b %zmm4, %zmm3, %zmm0
1537 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1538 ; AVX512VBMI2-NEXT: vzeroupper
1539 ; AVX512VBMI2-NEXT: retq
1541 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1542 ; AVX512VLBW: # %bb.0:
1543 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1544 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1545 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1546 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1547 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1548 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1549 ; AVX512VLBW-NEXT: vzeroupper
1550 ; AVX512VLBW-NEXT: retq
1552 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1553 ; AVX512VLVBMI2: # %bb.0:
1554 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1555 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1556 ; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1557 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1558 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1559 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1560 ; AVX512VLVBMI2-NEXT: vzeroupper
1561 ; AVX512VLVBMI2-NEXT: retq
1563 ; XOP-LABEL: splatvar_funnnel_v16i8:
1565 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1566 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1567 ; XOP-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1568 ; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1569 ; XOP-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1570 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14]
1573 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1574 ; X86-SSE2: # %bb.0:
1575 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
1576 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1577 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1578 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm4
1579 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1580 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1581 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1582 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm1
1583 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1584 ; X86-SSE2-NEXT: packuswb %xmm4, %xmm3
1585 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
1586 ; X86-SSE2-NEXT: retl
; The all-zero shuffle mask broadcasts element 0 of %amt into all 16 lanes.
1587 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
; Funnel shift right of the pair (%x, %y) by the splatted amount.
1588 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
; Funnel-shift-right (llvm.fshr) test on <2 x i64> with a constant,
; non-uniform shift amount vector <4, 14>.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
1596 define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1597 ; SSE2-LABEL: constant_funnnel_v2i64:
1599 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1600 ; SSE2-NEXT: psrlq $4, %xmm2
1601 ; SSE2-NEXT: psrlq $14, %xmm1
1602 ; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1603 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1604 ; SSE2-NEXT: psllq $60, %xmm1
1605 ; SSE2-NEXT: psllq $50, %xmm0
1606 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1607 ; SSE2-NEXT: orpd %xmm2, %xmm0
1610 ; SSE41-LABEL: constant_funnnel_v2i64:
1612 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1613 ; SSE41-NEXT: psrlq $14, %xmm2
1614 ; SSE41-NEXT: psrlq $4, %xmm1
1615 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1616 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1617 ; SSE41-NEXT: psllq $50, %xmm1
1618 ; SSE41-NEXT: psllq $60, %xmm0
1619 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1620 ; SSE41-NEXT: por %xmm2, %xmm0
1623 ; AVX1-LABEL: constant_funnnel_v2i64:
1625 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
1626 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
1627 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1628 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
1629 ; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
1630 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1631 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1634 ; AVX2-LABEL: constant_funnnel_v2i64:
1636 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1637 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1638 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1641 ; AVX512F-LABEL: constant_funnnel_v2i64:
1643 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1644 ; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1645 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1646 ; AVX512F-NEXT: retq
1648 ; AVX512VL-LABEL: constant_funnnel_v2i64:
1649 ; AVX512VL: # %bb.0:
1650 ; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1651 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1652 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1653 ; AVX512VL-NEXT: retq
1655 ; AVX512BW-LABEL: constant_funnnel_v2i64:
1656 ; AVX512BW: # %bb.0:
1657 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1658 ; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1659 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1660 ; AVX512BW-NEXT: retq
1662 ; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1663 ; AVX512VBMI2: # %bb.0:
1664 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1665 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1666 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14]
1667 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1668 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1669 ; AVX512VBMI2-NEXT: vzeroupper
1670 ; AVX512VBMI2-NEXT: retq
1672 ; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1673 ; AVX512VLBW: # %bb.0:
1674 ; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1675 ; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1676 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1677 ; AVX512VLBW-NEXT: retq
1679 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1680 ; AVX512VLVBMI2: # %bb.0:
1681 ; AVX512VLVBMI2-NEXT: vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1682 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1683 ; AVX512VLVBMI2-NEXT: retq
1685 ; XOPAVX1-LABEL: constant_funnnel_v2i64:
1687 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1688 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1689 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1690 ; XOPAVX1-NEXT: retq
1692 ; XOPAVX2-LABEL: constant_funnnel_v2i64:
1694 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1695 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1696 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1697 ; XOPAVX2-NEXT: retq
1699 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
1700 ; X86-SSE2: # %bb.0:
1701 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1702 ; X86-SSE2-NEXT: psrlq $4, %xmm2
1703 ; X86-SSE2-NEXT: psrlq $14, %xmm1
1704 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1705 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1706 ; X86-SSE2-NEXT: psllq $60, %xmm1
1707 ; X86-SSE2-NEXT: psllq $50, %xmm0
1708 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1709 ; X86-SSE2-NEXT: orpd %xmm2, %xmm0
1710 ; X86-SSE2-NEXT: retl
; Lane 0 is shifted by 4 and lane 1 by 14.
1711 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
; Funnel-shift-right (llvm.fshr) test on <4 x i32> with a constant,
; non-uniform shift amount vector <4, 5, 6, 7>.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
1715 define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
1716 ; SSE2-LABEL: constant_funnnel_v4i32:
1718 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1719 ; SSE2-NEXT: psrld $7, %xmm2
1720 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1721 ; SSE2-NEXT: psrld $6, %xmm3
1722 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1723 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1724 ; SSE2-NEXT: psrld $5, %xmm2
1725 ; SSE2-NEXT: psrld $4, %xmm1
1726 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1727 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1728 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1729 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1730 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1731 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1732 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1733 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1734 ; SSE2-NEXT: por %xmm1, %xmm0
1737 ; SSE41-LABEL: constant_funnnel_v4i32:
1739 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1740 ; SSE41-NEXT: psrld $7, %xmm2
1741 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1742 ; SSE41-NEXT: psrld $5, %xmm3
1743 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1744 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1745 ; SSE41-NEXT: psrld $6, %xmm2
1746 ; SSE41-NEXT: psrld $4, %xmm1
1747 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1748 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1749 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1750 ; SSE41-NEXT: por %xmm2, %xmm0
1753 ; AVX1-LABEL: constant_funnnel_v4i32:
1755 ; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
1756 ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
1757 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1758 ; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
1759 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
1760 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1761 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1762 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1763 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1766 ; AVX2-LABEL: constant_funnnel_v4i32:
1768 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1769 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1770 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1773 ; AVX512F-LABEL: constant_funnnel_v4i32:
1775 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1776 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1777 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1778 ; AVX512F-NEXT: retq
1780 ; AVX512VL-LABEL: constant_funnnel_v4i32:
1781 ; AVX512VL: # %bb.0:
1782 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1783 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1784 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1785 ; AVX512VL-NEXT: retq
1787 ; AVX512BW-LABEL: constant_funnnel_v4i32:
1788 ; AVX512BW: # %bb.0:
1789 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1790 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1791 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1792 ; AVX512BW-NEXT: retq
1794 ; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1795 ; AVX512VBMI2: # %bb.0:
1796 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1797 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1798 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7]
1799 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1800 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1801 ; AVX512VBMI2-NEXT: vzeroupper
1802 ; AVX512VBMI2-NEXT: retq
1804 ; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1805 ; AVX512VLBW: # %bb.0:
1806 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1807 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1808 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1809 ; AVX512VLBW-NEXT: retq
1811 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1812 ; AVX512VLVBMI2: # %bb.0:
1813 ; AVX512VLVBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1814 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1815 ; AVX512VLVBMI2-NEXT: retq
1817 ; XOPAVX1-LABEL: constant_funnnel_v4i32:
1819 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1820 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1821 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1822 ; XOPAVX1-NEXT: retq
1824 ; XOPAVX2-LABEL: constant_funnnel_v4i32:
1826 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1827 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1828 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1829 ; XOPAVX2-NEXT: retq
1831 ; X86-SSE2-LABEL: constant_funnnel_v4i32:
1832 ; X86-SSE2: # %bb.0:
1833 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1834 ; X86-SSE2-NEXT: psrld $7, %xmm2
1835 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1836 ; X86-SSE2-NEXT: psrld $6, %xmm3
1837 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1838 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1839 ; X86-SSE2-NEXT: psrld $5, %xmm2
1840 ; X86-SSE2-NEXT: psrld $4, %xmm1
1841 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1842 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1843 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1844 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1845 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1846 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1847 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1848 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1849 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1850 ; X86-SSE2-NEXT: retl
; Lanes 0..3 are shifted by 4, 5, 6 and 7 respectively.
1851 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
; Funnel-shift-right (llvm.fshr) test on <8 x i16> with a constant,
; non-uniform shift amount vector <0, 1, 2, 3, 4, 5, 6, 7>. Note that lane 0
; has a shift amount of zero.
; The per-target assertions below are autogenerated (see the NOTE on the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
1855 define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
1856 ; SSE2-LABEL: constant_funnnel_v8i16:
1858 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1859 ; SSE2-NEXT: pandn %xmm1, %xmm2
1860 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1861 ; SSE2-NEXT: por %xmm1, %xmm2
1862 ; SSE2-NEXT: paddw %xmm0, %xmm0
1863 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1864 ; SSE2-NEXT: por %xmm2, %xmm0
1867 ; SSE41-LABEL: constant_funnnel_v8i16:
1869 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
1870 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2
1871 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1872 ; SSE41-NEXT: paddw %xmm0, %xmm0
1873 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1874 ; SSE41-NEXT: por %xmm2, %xmm0
1877 ; AVX-LABEL: constant_funnnel_v8i16:
1879 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1880 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1881 ; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1882 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1883 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1886 ; AVX512F-LABEL: constant_funnnel_v8i16:
1888 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1889 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1890 ; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1891 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1892 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1893 ; AVX512F-NEXT: retq
1895 ; AVX512VL-LABEL: constant_funnnel_v8i16:
1896 ; AVX512VL: # %bb.0:
1897 ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1898 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1899 ; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1900 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1901 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1902 ; AVX512VL-NEXT: retq
1904 ; AVX512BW-LABEL: constant_funnnel_v8i16:
1905 ; AVX512BW: # %bb.0:
1906 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1907 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1908 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
1909 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
1910 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1911 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1912 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1913 ; AVX512BW-NEXT: vzeroupper
1914 ; AVX512BW-NEXT: retq
1916 ; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
1917 ; AVX512VBMI2: # %bb.0:
1918 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1919 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1920 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1921 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
1922 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1923 ; AVX512VBMI2-NEXT: vzeroupper
1924 ; AVX512VBMI2-NEXT: retq
1926 ; AVX512VLBW-LABEL: constant_funnnel_v8i16:
1927 ; AVX512VLBW: # %bb.0:
1928 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1929 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1930 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1931 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1932 ; AVX512VLBW-NEXT: retq
1934 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
1935 ; AVX512VLVBMI2: # %bb.0:
1936 ; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1937 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1938 ; AVX512VLVBMI2-NEXT: retq
1940 ; XOP-LABEL: constant_funnnel_v8i16:
1942 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1943 ; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1944 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1945 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1948 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
1949 ; X86-SSE2: # %bb.0:
1950 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1951 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
1952 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1953 ; X86-SSE2-NEXT: por %xmm1, %xmm2
1954 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
1955 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1956 ; X86-SSE2-NEXT: por %xmm2, %xmm0
1957 ; X86-SSE2-NEXT: retl
; Lane i is shifted by i, for i = 0..7.
1958 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
1962 define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
1963 ; SSE2-LABEL: constant_funnnel_v16i8:
1965 ; SSE2-NEXT: pxor %xmm2, %xmm2
1966 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1967 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1968 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1969 ; SSE2-NEXT: psrlw $8, %xmm3
1970 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1971 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1972 ; SSE2-NEXT: psrlw $8, %xmm1
1973 ; SSE2-NEXT: packuswb %xmm3, %xmm1
1974 ; SSE2-NEXT: paddb %xmm0, %xmm0
1975 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1976 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1977 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1978 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1979 ; SSE2-NEXT: pand %xmm3, %xmm2
1980 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1981 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1982 ; SSE2-NEXT: pand %xmm3, %xmm0
1983 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1984 ; SSE2-NEXT: por %xmm1, %xmm0
1987 ; SSE41-LABEL: constant_funnnel_v16i8:
1989 ; SSE41-NEXT: paddb %xmm0, %xmm0
1990 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1991 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1992 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1993 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1994 ; SSE41-NEXT: pand %xmm3, %xmm0
1995 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1996 ; SSE41-NEXT: pand %xmm3, %xmm2
1997 ; SSE41-NEXT: packuswb %xmm0, %xmm2
1998 ; SSE41-NEXT: pxor %xmm3, %xmm3
1999 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2000 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
2001 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2002 ; SSE41-NEXT: psrlw $8, %xmm1
2003 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2004 ; SSE41-NEXT: psrlw $8, %xmm0
2005 ; SSE41-NEXT: packuswb %xmm1, %xmm0
2006 ; SSE41-NEXT: por %xmm2, %xmm0
2009 ; AVX1-LABEL: constant_funnnel_v16i8:
2011 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2012 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2013 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2014 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2015 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2016 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2017 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2018 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2019 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2020 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2021 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2022 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2023 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2024 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2025 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2026 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2027 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2028 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2031 ; AVX2-LABEL: constant_funnnel_v16i8:
2033 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2034 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2035 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2036 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2037 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2038 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2039 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2040 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2041 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2042 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2043 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2044 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2045 ; AVX2-NEXT: vzeroupper
2048 ; AVX512F-LABEL: constant_funnnel_v16i8:
2050 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2051 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2052 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2053 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2054 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2055 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
2056 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2057 ; AVX512F-NEXT: vzeroupper
2058 ; AVX512F-NEXT: retq
2060 ; AVX512VL-LABEL: constant_funnnel_v16i8:
2061 ; AVX512VL: # %bb.0:
2062 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2063 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2064 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2065 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2066 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2067 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
2068 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2069 ; AVX512VL-NEXT: vzeroupper
2070 ; AVX512VL-NEXT: retq
2072 ; AVX512BW-LABEL: constant_funnnel_v16i8:
2073 ; AVX512BW: # %bb.0:
2074 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2075 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2076 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2077 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
2078 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2079 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
2080 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2081 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2082 ; AVX512BW-NEXT: vzeroupper
2083 ; AVX512BW-NEXT: retq
2085 ; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2086 ; AVX512VBMI2: # %bb.0:
2087 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2088 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2089 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
2090 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1
2091 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2092 ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0
2093 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2094 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2095 ; AVX512VBMI2-NEXT: vzeroupper
2096 ; AVX512VBMI2-NEXT: retq
2098 ; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2099 ; AVX512VLBW: # %bb.0:
2100 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2101 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2102 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
2103 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2104 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2105 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2106 ; AVX512VLBW-NEXT: vzeroupper
2107 ; AVX512VLBW-NEXT: retq
2109 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2110 ; AVX512VLVBMI2: # %bb.0:
2111 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2112 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2113 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
2114 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
2115 ; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2116 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2117 ; AVX512VLVBMI2-NEXT: vzeroupper
2118 ; AVX512VLVBMI2-NEXT: retq
2120 ; XOP-LABEL: constant_funnnel_v16i8:
2122 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2123 ; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2124 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2125 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2128 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
2129 ; X86-SSE2: # %bb.0:
2130 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
2131 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
2132 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2133 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
2134 ; X86-SSE2-NEXT: psrlw $8, %xmm3
2135 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2136 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2137 ; X86-SSE2-NEXT: psrlw $8, %xmm1
2138 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
2139 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
2140 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
2141 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2142 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
2143 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2144 ; X86-SSE2-NEXT: pand %xmm3, %xmm2
2145 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2146 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2147 ; X86-SSE2-NEXT: pand %xmm3, %xmm0
2148 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
2149 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2150 ; X86-SSE2-NEXT: retl
2151 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2156 ; Uniform Constant Shifts
; Funnel-shift-right of <2 x i64> by a splat constant: the call at the end of
; this function passes <i64 14, i64 14> as the amount operand of
; @llvm.fshr.v2i64(%x, %y, ...).
; The check lines below are autogenerated (see the note on the first line of
; the file) and are reproduced unchanged. What they pin down:
;   - runs without VBMI2 expect psrlq $14 on xmm1, psllq $50 on xmm0
;     (50 = 64 - 14), and an or of the two results;
;   - the AVX512VBMI2 run (no VL) expects a single vpshrdq $14 on zmm
;     registers, followed by vzeroupper;
;   - the AVX512VLVBMI2 run expects a single vpshrdq $14 directly on xmm
;     registers.
2159 define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2160 ; SSE-LABEL: splatconstant_funnnel_v2i64:
2162 ; SSE-NEXT: psrlq $14, %xmm1
2163 ; SSE-NEXT: psllq $50, %xmm0
2164 ; SSE-NEXT: por %xmm1, %xmm0
2167 ; AVX-LABEL: splatconstant_funnnel_v2i64:
2169 ; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
2170 ; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
2171 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2174 ; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2176 ; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1
2177 ; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0
2178 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2179 ; AVX512F-NEXT: retq
2181 ; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2182 ; AVX512VL: # %bb.0:
2183 ; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1
2184 ; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0
2185 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2186 ; AVX512VL-NEXT: retq
2188 ; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2189 ; AVX512BW: # %bb.0:
2190 ; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1
2191 ; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0
2192 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2193 ; AVX512BW-NEXT: retq
2195 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2196 ; AVX512VBMI2: # %bb.0:
2197 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2198 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2199 ; AVX512VBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0
2200 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2201 ; AVX512VBMI2-NEXT: vzeroupper
2202 ; AVX512VBMI2-NEXT: retq
2204 ; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2205 ; AVX512VLBW: # %bb.0:
2206 ; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1
2207 ; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0
2208 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2209 ; AVX512VLBW-NEXT: retq
2211 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2212 ; AVX512VLVBMI2: # %bb.0:
2213 ; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0
2214 ; AVX512VLVBMI2-NEXT: retq
2216 ; XOP-LABEL: splatconstant_funnnel_v2i64:
2218 ; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
2219 ; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
2220 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2223 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
2224 ; X86-SSE2: # %bb.0:
2225 ; X86-SSE2-NEXT: psrlq $14, %xmm1
2226 ; X86-SSE2-NEXT: psllq $50, %xmm0
2227 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2228 ; X86-SSE2-NEXT: retl
2229 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
; Funnel-shift-right of <4 x i32> by a splat constant: the call at the end of
; this function passes the amount 4 in all four lanes of
; @llvm.fshr.v4i32(%x, %y, ...).
; The check lines below are autogenerated (see the note on the first line of
; the file) and are reproduced unchanged. What they pin down:
;   - runs without VBMI2 expect psrld $4 on xmm1, pslld $28 on xmm0
;     (28 = 32 - 4), and an or of the two results;
;   - the AVX512VBMI2 run (no VL) expects a single vpshrdd $4 on zmm
;     registers, followed by vzeroupper;
;   - the AVX512VLVBMI2 run expects a single vpshrdd $4 directly on xmm
;     registers.
2233 define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2234 ; SSE-LABEL: splatconstant_funnnel_v4i32:
2236 ; SSE-NEXT: psrld $4, %xmm1
2237 ; SSE-NEXT: pslld $28, %xmm0
2238 ; SSE-NEXT: por %xmm1, %xmm0
2241 ; AVX-LABEL: splatconstant_funnnel_v4i32:
2243 ; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
2244 ; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2245 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2248 ; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2250 ; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1
2251 ; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0
2252 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2253 ; AVX512F-NEXT: retq
2255 ; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2256 ; AVX512VL: # %bb.0:
2257 ; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
2258 ; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0
2259 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2260 ; AVX512VL-NEXT: retq
2262 ; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2263 ; AVX512BW: # %bb.0:
2264 ; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1
2265 ; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0
2266 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2267 ; AVX512BW-NEXT: retq
2269 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2270 ; AVX512VBMI2: # %bb.0:
2271 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2272 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2273 ; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0
2274 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2275 ; AVX512VBMI2-NEXT: vzeroupper
2276 ; AVX512VBMI2-NEXT: retq
2278 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2279 ; AVX512VLBW: # %bb.0:
2280 ; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1
2281 ; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0
2282 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2283 ; AVX512VLBW-NEXT: retq
2285 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2286 ; AVX512VLVBMI2: # %bb.0:
2287 ; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
2288 ; AVX512VLVBMI2-NEXT: retq
2290 ; XOP-LABEL: splatconstant_funnnel_v4i32:
2292 ; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
2293 ; XOP-NEXT: vpslld $28, %xmm0, %xmm0
2294 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2297 ; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
2298 ; X86-SSE2: # %bb.0:
2299 ; X86-SSE2-NEXT: psrld $4, %xmm1
2300 ; X86-SSE2-NEXT: pslld $28, %xmm0
2301 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2302 ; X86-SSE2-NEXT: retl
2303 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
; Funnel-shift-right of <8 x i16> by a splat constant: the call at the end of
; this function passes the amount 7 in all eight lanes of
; @llvm.fshr.v8i16(%x, %y, ...).
; The check lines below are autogenerated (see the note on the first line of
; the file) and are reproduced unchanged. What they pin down:
;   - runs without VBMI2 expect psrlw $7 on xmm1, psllw $9 on xmm0
;     (9 = 16 - 7), and an or of the two results;
;   - the AVX512VBMI2 run (no VL) expects a single vpshrdw $7 on zmm
;     registers, followed by vzeroupper;
;   - the AVX512VLVBMI2 run expects a single vpshrdw $7 directly on xmm
;     registers.
2307 define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2308 ; SSE-LABEL: splatconstant_funnnel_v8i16:
2310 ; SSE-NEXT: psrlw $7, %xmm1
2311 ; SSE-NEXT: psllw $9, %xmm0
2312 ; SSE-NEXT: por %xmm1, %xmm0
2315 ; AVX-LABEL: splatconstant_funnnel_v8i16:
2317 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
2318 ; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2319 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2322 ; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2324 ; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1
2325 ; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
2326 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2327 ; AVX512F-NEXT: retq
2329 ; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2330 ; AVX512VL: # %bb.0:
2331 ; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1
2332 ; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
2333 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2334 ; AVX512VL-NEXT: retq
2336 ; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2337 ; AVX512BW: # %bb.0:
2338 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
2339 ; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
2340 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2341 ; AVX512BW-NEXT: retq
2343 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2344 ; AVX512VBMI2: # %bb.0:
2345 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2346 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2347 ; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0
2348 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2349 ; AVX512VBMI2-NEXT: vzeroupper
2350 ; AVX512VBMI2-NEXT: retq
2352 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2353 ; AVX512VLBW: # %bb.0:
2354 ; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1
2355 ; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
2356 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2357 ; AVX512VLBW-NEXT: retq
2359 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2360 ; AVX512VLVBMI2: # %bb.0:
2361 ; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0
2362 ; AVX512VLVBMI2-NEXT: retq
2364 ; XOP-LABEL: splatconstant_funnnel_v8i16:
2366 ; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
2367 ; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
2368 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2371 ; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
2372 ; X86-SSE2: # %bb.0:
2373 ; X86-SSE2-NEXT: psrlw $7, %xmm1
2374 ; X86-SSE2-NEXT: psllw $9, %xmm0
2375 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2376 ; X86-SSE2-NEXT: retl
2377 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
; Funnel-shift-right of <16 x i8> by a splat constant: the call at the end of
; this function passes the amount 4 in all sixteen lanes of
; @llvm.fshr.v16i8(%x, %y, ...).
; The check lines below are autogenerated (see the note on the first line of
; the file) and are reproduced unchanged. What they pin down:
;   - the SSE, AVX and 32-bit SSE2 runs expect 16-bit shifts (psrlw $4 on
;     xmm1, psllw $4 on xmm0), each followed by a pand with a constant-pool
;     operand, and an or of the two results;
;   - every AVX512 run, including both VBMI2 ones, expects the two 16-bit
;     shifts to be combined by one vpternlogq $216 with a broadcast constant
;     (zmm form plus vzeroupper without VL, xmm form with VL); no vpshrd
;     instruction is expected for byte elements here;
;   - the XOP runs expect two vpshlb with constant-pool shift operands and a
;     vpor.
; NOTE(review): the pand/vpternlogq constants presumably mask off bits that
; the 16-bit shifts move across byte boundaries; the constant values are not
; visible in these checks, so confirm against the llc output.
2381 define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2382 ; SSE-LABEL: splatconstant_funnnel_v16i8:
2384 ; SSE-NEXT: psrlw $4, %xmm1
2385 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2386 ; SSE-NEXT: psllw $4, %xmm0
2387 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2388 ; SSE-NEXT: por %xmm1, %xmm0
2391 ; AVX-LABEL: splatconstant_funnnel_v16i8:
2393 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
2394 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2395 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2396 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2397 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2400 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
2402 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2
2403 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0
2404 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
2405 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2406 ; AVX512F-NEXT: vzeroupper
2407 ; AVX512F-NEXT: retq
2409 ; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
2410 ; AVX512VL: # %bb.0:
2411 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
2412 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
2413 ; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
2414 ; AVX512VL-NEXT: retq
2416 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
2417 ; AVX512BW: # %bb.0:
2418 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2
2419 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0
2420 ; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
2421 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2422 ; AVX512BW-NEXT: vzeroupper
2423 ; AVX512BW-NEXT: retq
2425 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
2426 ; AVX512VBMI2: # %bb.0:
2427 ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2428 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2429 ; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0
2430 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2431 ; AVX512VBMI2-NEXT: vzeroupper
2432 ; AVX512VBMI2-NEXT: retq
2434 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
2435 ; AVX512VLBW: # %bb.0:
2436 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
2437 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
2438 ; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
2439 ; AVX512VLBW-NEXT: retq
2441 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
2442 ; AVX512VLVBMI2: # %bb.0:
2443 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2444 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2445 ; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0
2446 ; AVX512VLVBMI2-NEXT: retq
2448 ; XOP-LABEL: splatconstant_funnnel_v16i8:
2450 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2451 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2452 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2455 ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
2456 ; X86-SSE2: # %bb.0:
2457 ; X86-SSE2-NEXT: psrlw $4, %xmm1
2458 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2459 ; X86-SSE2-NEXT: psllw $4, %xmm0
2460 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2461 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2462 ; X86-SSE2-NEXT: retl
2463 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)