1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
14 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
15 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
17 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
18 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
20 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
21 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
22 declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
23 declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
29 define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
30 ; SSE2-LABEL: var_funnnel_v2i64:
32 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
33 ; SSE2-NEXT: movdqa %xmm2, %xmm4
34 ; SSE2-NEXT: pand %xmm3, %xmm4
35 ; SSE2-NEXT: movdqa %xmm1, %xmm5
36 ; SSE2-NEXT: psrlq %xmm4, %xmm5
37 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
38 ; SSE2-NEXT: psrlq %xmm4, %xmm1
39 ; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
40 ; SSE2-NEXT: pandn %xmm3, %xmm2
41 ; SSE2-NEXT: paddq %xmm0, %xmm0
42 ; SSE2-NEXT: movdqa %xmm0, %xmm1
43 ; SSE2-NEXT: psllq %xmm2, %xmm1
44 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
45 ; SSE2-NEXT: psllq %xmm2, %xmm0
46 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
47 ; SSE2-NEXT: orpd %xmm5, %xmm0
50 ; SSE41-LABEL: var_funnnel_v2i64:
52 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
53 ; SSE41-NEXT: movdqa %xmm2, %xmm4
54 ; SSE41-NEXT: pand %xmm3, %xmm4
55 ; SSE41-NEXT: movdqa %xmm1, %xmm5
56 ; SSE41-NEXT: psrlq %xmm4, %xmm5
57 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
58 ; SSE41-NEXT: psrlq %xmm4, %xmm1
59 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
60 ; SSE41-NEXT: pandn %xmm3, %xmm2
61 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
62 ; SSE41-NEXT: paddq %xmm0, %xmm0
63 ; SSE41-NEXT: movdqa %xmm0, %xmm3
64 ; SSE41-NEXT: psllq %xmm1, %xmm3
65 ; SSE41-NEXT: psllq %xmm2, %xmm0
66 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
67 ; SSE41-NEXT: por %xmm5, %xmm0
70 ; AVX1-LABEL: var_funnnel_v2i64:
72 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
73 ; AVX1-NEXT: # xmm3 = mem[0,0]
74 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
75 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
76 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
77 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
78 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
79 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
80 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
81 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
82 ; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
83 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
84 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
85 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
88 ; AVX2-LABEL: var_funnnel_v2i64:
90 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
91 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
92 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
93 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
94 ; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
95 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
96 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
99 ; AVX512F-LABEL: var_funnnel_v2i64:
101 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
102 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
103 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
104 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
105 ; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
106 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
107 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
110 ; AVX512VL-LABEL: var_funnnel_v2i64:
112 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
113 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
114 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
115 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
116 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
117 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
118 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
119 ; AVX512VL-NEXT: retq
121 ; AVX512BW-LABEL: var_funnnel_v2i64:
123 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
124 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
125 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
126 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
127 ; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
128 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
129 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
130 ; AVX512BW-NEXT: retq
132 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
133 ; AVX512VBMI2: # %bb.0:
134 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
135 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
136 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
137 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
138 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
139 ; AVX512VBMI2-NEXT: vzeroupper
140 ; AVX512VBMI2-NEXT: retq
142 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
143 ; AVX512VLBW: # %bb.0:
144 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
145 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
146 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
147 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
148 ; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
149 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
150 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
151 ; AVX512VLBW-NEXT: retq
153 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
154 ; AVX512VLVBMI2: # %bb.0:
155 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
156 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
157 ; AVX512VLVBMI2-NEXT: retq
159 ; XOPAVX1-LABEL: var_funnnel_v2i64:
161 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
162 ; XOPAVX1-NEXT: # xmm3 = mem[0,0]
163 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
164 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
165 ; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
166 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
167 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
168 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
169 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
170 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
173 ; XOPAVX2-LABEL: var_funnnel_v2i64:
175 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
176 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
177 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
178 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
179 ; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
180 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
181 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
184 ; X86-SSE2-LABEL: var_funnnel_v2i64:
186 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
187 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
188 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
189 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
190 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm3
191 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
192 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
193 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
194 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
195 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0
196 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
197 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1
198 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
199 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
200 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
201 ; X86-SSE2-NEXT: orpd %xmm3, %xmm0
202 ; X86-SSE2-NEXT: retl
203 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
207 define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
208 ; SSE2-LABEL: var_funnnel_v4i32:
210 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
211 ; SSE2-NEXT: movdqa %xmm2, %xmm5
212 ; SSE2-NEXT: pand %xmm4, %xmm5
213 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
214 ; SSE2-NEXT: movdqa %xmm1, %xmm6
215 ; SSE2-NEXT: psrld %xmm3, %xmm6
216 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
217 ; SSE2-NEXT: movdqa %xmm1, %xmm3
218 ; SSE2-NEXT: psrld %xmm7, %xmm3
219 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
220 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
221 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
222 ; SSE2-NEXT: movdqa %xmm1, %xmm7
223 ; SSE2-NEXT: psrld %xmm6, %xmm7
224 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
225 ; SSE2-NEXT: psrld %xmm5, %xmm1
226 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
227 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
228 ; SSE2-NEXT: pandn %xmm4, %xmm2
229 ; SSE2-NEXT: pslld $23, %xmm2
230 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
231 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
232 ; SSE2-NEXT: paddd %xmm0, %xmm0
233 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
234 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
235 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
236 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
237 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
238 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
239 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
240 ; SSE2-NEXT: por %xmm3, %xmm0
243 ; SSE41-LABEL: var_funnnel_v4i32:
245 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31]
246 ; SSE41-NEXT: movdqa %xmm2, %xmm4
247 ; SSE41-NEXT: pand %xmm3, %xmm4
248 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
249 ; SSE41-NEXT: movdqa %xmm1, %xmm6
250 ; SSE41-NEXT: psrld %xmm5, %xmm6
251 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
252 ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
253 ; SSE41-NEXT: movdqa %xmm1, %xmm8
254 ; SSE41-NEXT: psrld %xmm7, %xmm8
255 ; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
256 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
257 ; SSE41-NEXT: movdqa %xmm1, %xmm6
258 ; SSE41-NEXT: psrld %xmm4, %xmm6
259 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
260 ; SSE41-NEXT: psrld %xmm4, %xmm1
261 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
262 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
263 ; SSE41-NEXT: pandn %xmm3, %xmm2
264 ; SSE41-NEXT: pslld $23, %xmm2
265 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
266 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
267 ; SSE41-NEXT: paddd %xmm0, %xmm0
268 ; SSE41-NEXT: pmulld %xmm1, %xmm0
269 ; SSE41-NEXT: por %xmm6, %xmm0
272 ; AVX1-LABEL: var_funnnel_v4i32:
274 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
275 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
276 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
277 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
278 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
279 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
280 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
281 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
282 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
283 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
284 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
285 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
286 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
287 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
288 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
289 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
290 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
291 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
292 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
293 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
294 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
297 ; AVX2-LABEL: var_funnnel_v4i32:
299 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
300 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
301 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
302 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
303 ; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
304 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
305 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
308 ; AVX512F-LABEL: var_funnnel_v4i32:
310 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
311 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
312 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
313 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
314 ; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0
315 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
316 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
319 ; AVX512VL-LABEL: var_funnnel_v4i32:
321 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
322 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
323 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
324 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
325 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0
326 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
327 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
328 ; AVX512VL-NEXT: retq
330 ; AVX512BW-LABEL: var_funnnel_v4i32:
332 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
333 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
334 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
335 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
336 ; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
337 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
338 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
339 ; AVX512BW-NEXT: retq
341 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
342 ; AVX512VBMI2: # %bb.0:
343 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
344 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
345 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
346 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
347 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
348 ; AVX512VBMI2-NEXT: vzeroupper
349 ; AVX512VBMI2-NEXT: retq
351 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
352 ; AVX512VLBW: # %bb.0:
353 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
354 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
355 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
356 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
357 ; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
358 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
359 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
360 ; AVX512VLBW-NEXT: retq
362 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
363 ; AVX512VLVBMI2: # %bb.0:
364 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
365 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
366 ; AVX512VLVBMI2-NEXT: retq
368 ; XOPAVX1-LABEL: var_funnnel_v4i32:
370 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
371 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
372 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
373 ; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
374 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
375 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
376 ; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
377 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
378 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
381 ; XOPAVX2-LABEL: var_funnnel_v4i32:
383 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
384 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
385 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
386 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
387 ; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
388 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
389 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
392 ; X86-SSE2-LABEL: var_funnnel_v4i32:
394 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
395 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
396 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
397 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
398 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
399 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6
400 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
401 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
402 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3
403 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
404 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
405 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
406 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
407 ; X86-SSE2-NEXT: psrld %xmm6, %xmm7
408 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
409 ; X86-SSE2-NEXT: psrld %xmm5, %xmm1
410 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
411 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
412 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
413 ; X86-SSE2-NEXT: pslld $23, %xmm2
414 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
415 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
416 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0
417 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
418 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
419 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
420 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
421 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
422 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
423 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
424 ; X86-SSE2-NEXT: por %xmm3, %xmm0
425 ; X86-SSE2-NEXT: retl
426 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
430 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
431 ; SSE2-LABEL: var_funnnel_v8i16:
433 ; SSE2-NEXT: movdqa %xmm2, %xmm4
434 ; SSE2-NEXT: psllw $12, %xmm4
435 ; SSE2-NEXT: movdqa %xmm4, %xmm3
436 ; SSE2-NEXT: psraw $15, %xmm3
437 ; SSE2-NEXT: movdqa %xmm3, %xmm5
438 ; SSE2-NEXT: pandn %xmm1, %xmm5
439 ; SSE2-NEXT: psrlw $8, %xmm1
440 ; SSE2-NEXT: pand %xmm1, %xmm3
441 ; SSE2-NEXT: por %xmm5, %xmm3
442 ; SSE2-NEXT: paddw %xmm4, %xmm4
443 ; SSE2-NEXT: movdqa %xmm4, %xmm1
444 ; SSE2-NEXT: psraw $15, %xmm1
445 ; SSE2-NEXT: movdqa %xmm1, %xmm5
446 ; SSE2-NEXT: pandn %xmm3, %xmm5
447 ; SSE2-NEXT: psrlw $4, %xmm3
448 ; SSE2-NEXT: pand %xmm1, %xmm3
449 ; SSE2-NEXT: por %xmm5, %xmm3
450 ; SSE2-NEXT: paddw %xmm4, %xmm4
451 ; SSE2-NEXT: movdqa %xmm4, %xmm1
452 ; SSE2-NEXT: psraw $15, %xmm1
453 ; SSE2-NEXT: movdqa %xmm1, %xmm5
454 ; SSE2-NEXT: pandn %xmm3, %xmm5
455 ; SSE2-NEXT: psrlw $2, %xmm3
456 ; SSE2-NEXT: pand %xmm1, %xmm3
457 ; SSE2-NEXT: por %xmm5, %xmm3
458 ; SSE2-NEXT: paddw %xmm4, %xmm4
459 ; SSE2-NEXT: psraw $15, %xmm4
460 ; SSE2-NEXT: movdqa %xmm4, %xmm1
461 ; SSE2-NEXT: pandn %xmm3, %xmm1
462 ; SSE2-NEXT: psrlw $1, %xmm3
463 ; SSE2-NEXT: pand %xmm4, %xmm3
464 ; SSE2-NEXT: por %xmm1, %xmm3
465 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
466 ; SSE2-NEXT: movdqa %xmm2, %xmm1
467 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
468 ; SSE2-NEXT: pslld $23, %xmm1
469 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
470 ; SSE2-NEXT: paddd %xmm4, %xmm1
471 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
472 ; SSE2-NEXT: pslld $16, %xmm1
473 ; SSE2-NEXT: psrad $16, %xmm1
474 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
475 ; SSE2-NEXT: pslld $23, %xmm2
476 ; SSE2-NEXT: paddd %xmm4, %xmm2
477 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
478 ; SSE2-NEXT: pslld $16, %xmm2
479 ; SSE2-NEXT: psrad $16, %xmm2
480 ; SSE2-NEXT: packssdw %xmm1, %xmm2
481 ; SSE2-NEXT: paddw %xmm0, %xmm0
482 ; SSE2-NEXT: pmullw %xmm2, %xmm0
483 ; SSE2-NEXT: por %xmm3, %xmm0
486 ; SSE41-LABEL: var_funnnel_v8i16:
488 ; SSE41-NEXT: movdqa %xmm0, %xmm3
489 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
490 ; SSE41-NEXT: movdqa %xmm2, %xmm0
491 ; SSE41-NEXT: pand %xmm5, %xmm0
492 ; SSE41-NEXT: movdqa %xmm0, %xmm4
493 ; SSE41-NEXT: psllw $12, %xmm4
494 ; SSE41-NEXT: psllw $4, %xmm0
495 ; SSE41-NEXT: por %xmm4, %xmm0
496 ; SSE41-NEXT: movdqa %xmm0, %xmm4
497 ; SSE41-NEXT: paddw %xmm0, %xmm4
498 ; SSE41-NEXT: movdqa %xmm1, %xmm6
499 ; SSE41-NEXT: psrlw $8, %xmm6
500 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
501 ; SSE41-NEXT: movdqa %xmm1, %xmm6
502 ; SSE41-NEXT: psrlw $4, %xmm6
503 ; SSE41-NEXT: movdqa %xmm4, %xmm0
504 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
505 ; SSE41-NEXT: movdqa %xmm1, %xmm6
506 ; SSE41-NEXT: psrlw $2, %xmm6
507 ; SSE41-NEXT: paddw %xmm4, %xmm4
508 ; SSE41-NEXT: movdqa %xmm4, %xmm0
509 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
510 ; SSE41-NEXT: movdqa %xmm1, %xmm6
511 ; SSE41-NEXT: psrlw $1, %xmm6
512 ; SSE41-NEXT: paddw %xmm4, %xmm4
513 ; SSE41-NEXT: movdqa %xmm4, %xmm0
514 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
515 ; SSE41-NEXT: pandn %xmm5, %xmm2
516 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
517 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
518 ; SSE41-NEXT: pslld $23, %xmm2
519 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
520 ; SSE41-NEXT: paddd %xmm4, %xmm2
521 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
522 ; SSE41-NEXT: pslld $23, %xmm0
523 ; SSE41-NEXT: paddd %xmm4, %xmm0
524 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
525 ; SSE41-NEXT: packusdw %xmm2, %xmm0
526 ; SSE41-NEXT: paddw %xmm3, %xmm3
527 ; SSE41-NEXT: pmullw %xmm0, %xmm3
528 ; SSE41-NEXT: por %xmm1, %xmm3
529 ; SSE41-NEXT: movdqa %xmm3, %xmm0
532 ; AVX1-LABEL: var_funnnel_v8i16:
534 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
535 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
536 ; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
537 ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
538 ; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
539 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
540 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6
541 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
542 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
543 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
544 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
545 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
546 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
547 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
548 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
549 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
550 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
551 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
552 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
553 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
554 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
555 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
556 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
557 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
558 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
559 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
560 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
561 ; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
562 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
563 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
566 ; AVX2-LABEL: var_funnnel_v8i16:
568 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
569 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
570 ; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
571 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
572 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
573 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
574 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
575 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
576 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
577 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
578 ; AVX2-NEXT: vzeroupper
581 ; AVX512F-LABEL: var_funnnel_v8i16:
583 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
584 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
585 ; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0
586 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
587 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
588 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
589 ; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
590 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
591 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
592 ; AVX512F-NEXT: vzeroupper
595 ; AVX512VL-LABEL: var_funnnel_v8i16:
597 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
598 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
599 ; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0
600 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
601 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
602 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
603 ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
604 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
605 ; AVX512VL-NEXT: vzeroupper
606 ; AVX512VL-NEXT: retq
608 ; AVX512BW-LABEL: var_funnnel_v8i16:
610 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
611 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
612 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
613 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
614 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
615 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
616 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
617 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
618 ; AVX512BW-NEXT: vzeroupper
619 ; AVX512BW-NEXT: retq
621 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
622 ; AVX512VBMI2: # %bb.0:
623 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
624 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
625 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
626 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
627 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
628 ; AVX512VBMI2-NEXT: vzeroupper
629 ; AVX512VBMI2-NEXT: retq
631 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
632 ; AVX512VLBW: # %bb.0:
633 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
634 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
635 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
636 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
637 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
638 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
639 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
640 ; AVX512VLBW-NEXT: retq
642 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
643 ; AVX512VLVBMI2: # %bb.0:
644 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
645 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
646 ; AVX512VLVBMI2-NEXT: retq
648 ; XOPAVX1-LABEL: var_funnnel_v8i16:
650 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
651 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
652 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
653 ; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4
654 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1
655 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
656 ; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
657 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
658 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
661 ; XOPAVX2-LABEL: var_funnnel_v8i16:
663 ; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
664 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
665 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
666 ; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4
667 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
668 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
669 ; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0
670 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
671 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
674 ; X86-SSE2-LABEL: var_funnnel_v8i16:
676 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
677 ; X86-SSE2-NEXT: psllw $12, %xmm4
678 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
679 ; X86-SSE2-NEXT: psraw $15, %xmm3
680 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
681 ; X86-SSE2-NEXT: pandn %xmm1, %xmm5
682 ; X86-SSE2-NEXT: psrlw $8, %xmm1
683 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
684 ; X86-SSE2-NEXT: por %xmm5, %xmm3
685 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
686 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
687 ; X86-SSE2-NEXT: psraw $15, %xmm1
688 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
689 ; X86-SSE2-NEXT: pandn %xmm3, %xmm5
690 ; X86-SSE2-NEXT: psrlw $4, %xmm3
691 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
692 ; X86-SSE2-NEXT: por %xmm5, %xmm3
693 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
694 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
695 ; X86-SSE2-NEXT: psraw $15, %xmm1
696 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
697 ; X86-SSE2-NEXT: pandn %xmm3, %xmm5
698 ; X86-SSE2-NEXT: psrlw $2, %xmm3
699 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
700 ; X86-SSE2-NEXT: por %xmm5, %xmm3
701 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
702 ; X86-SSE2-NEXT: psraw $15, %xmm4
703 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
704 ; X86-SSE2-NEXT: pandn %xmm3, %xmm1
705 ; X86-SSE2-NEXT: psrlw $1, %xmm3
706 ; X86-SSE2-NEXT: pand %xmm4, %xmm3
707 ; X86-SSE2-NEXT: por %xmm1, %xmm3
708 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
709 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
710 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
711 ; X86-SSE2-NEXT: pslld $23, %xmm1
712 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
713 ; X86-SSE2-NEXT: paddd %xmm4, %xmm1
714 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
715 ; X86-SSE2-NEXT: pslld $16, %xmm1
716 ; X86-SSE2-NEXT: psrad $16, %xmm1
717 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
718 ; X86-SSE2-NEXT: pslld $23, %xmm2
719 ; X86-SSE2-NEXT: paddd %xmm4, %xmm2
720 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
721 ; X86-SSE2-NEXT: pslld $16, %xmm2
722 ; X86-SSE2-NEXT: psrad $16, %xmm2
723 ; X86-SSE2-NEXT: packssdw %xmm1, %xmm2
724 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
725 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
726 ; X86-SSE2-NEXT: por %xmm3, %xmm0
727 ; X86-SSE2-NEXT: retl
728 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
732 define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
733 ; SSE2-LABEL: var_funnnel_v16i8:
735 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
736 ; SSE2-NEXT: movdqa %xmm2, %xmm6
737 ; SSE2-NEXT: pand %xmm5, %xmm6
738 ; SSE2-NEXT: psllw $5, %xmm6
739 ; SSE2-NEXT: pxor %xmm4, %xmm4
740 ; SSE2-NEXT: pxor %xmm3, %xmm3
741 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm3
742 ; SSE2-NEXT: movdqa %xmm3, %xmm7
743 ; SSE2-NEXT: pandn %xmm1, %xmm7
744 ; SSE2-NEXT: psrlw $4, %xmm1
745 ; SSE2-NEXT: pand %xmm1, %xmm3
746 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
747 ; SSE2-NEXT: por %xmm7, %xmm3
748 ; SSE2-NEXT: paddb %xmm6, %xmm6
749 ; SSE2-NEXT: pxor %xmm1, %xmm1
750 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
751 ; SSE2-NEXT: movdqa %xmm1, %xmm7
752 ; SSE2-NEXT: pandn %xmm3, %xmm7
753 ; SSE2-NEXT: psrlw $2, %xmm3
754 ; SSE2-NEXT: pand %xmm1, %xmm3
755 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
756 ; SSE2-NEXT: por %xmm7, %xmm3
757 ; SSE2-NEXT: paddb %xmm6, %xmm6
758 ; SSE2-NEXT: pxor %xmm1, %xmm1
759 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
760 ; SSE2-NEXT: movdqa %xmm1, %xmm6
761 ; SSE2-NEXT: pandn %xmm3, %xmm6
762 ; SSE2-NEXT: psrlw $1, %xmm3
763 ; SSE2-NEXT: pand %xmm1, %xmm3
764 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
765 ; SSE2-NEXT: por %xmm6, %xmm3
766 ; SSE2-NEXT: pandn %xmm5, %xmm2
767 ; SSE2-NEXT: psllw $5, %xmm2
768 ; SSE2-NEXT: pxor %xmm1, %xmm1
769 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
770 ; SSE2-NEXT: paddb %xmm0, %xmm0
771 ; SSE2-NEXT: movdqa %xmm1, %xmm5
772 ; SSE2-NEXT: pandn %xmm0, %xmm5
773 ; SSE2-NEXT: psllw $4, %xmm0
774 ; SSE2-NEXT: pand %xmm1, %xmm0
775 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
776 ; SSE2-NEXT: por %xmm5, %xmm0
777 ; SSE2-NEXT: paddb %xmm2, %xmm2
778 ; SSE2-NEXT: pxor %xmm1, %xmm1
779 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
780 ; SSE2-NEXT: movdqa %xmm1, %xmm5
781 ; SSE2-NEXT: pandn %xmm0, %xmm5
782 ; SSE2-NEXT: psllw $2, %xmm0
783 ; SSE2-NEXT: pand %xmm1, %xmm0
784 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
785 ; SSE2-NEXT: por %xmm5, %xmm0
786 ; SSE2-NEXT: paddb %xmm2, %xmm2
787 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
788 ; SSE2-NEXT: movdqa %xmm4, %xmm1
789 ; SSE2-NEXT: pandn %xmm0, %xmm1
790 ; SSE2-NEXT: paddb %xmm0, %xmm0
791 ; SSE2-NEXT: pand %xmm4, %xmm0
792 ; SSE2-NEXT: por %xmm1, %xmm0
793 ; SSE2-NEXT: por %xmm3, %xmm0
796 ; SSE41-LABEL: var_funnnel_v16i8:
798 ; SSE41-NEXT: movdqa %xmm2, %xmm3
799 ; SSE41-NEXT: movdqa %xmm0, %xmm2
800 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
801 ; SSE41-NEXT: movdqa %xmm3, %xmm0
802 ; SSE41-NEXT: pand %xmm5, %xmm0
803 ; SSE41-NEXT: psllw $5, %xmm0
804 ; SSE41-NEXT: movdqa %xmm0, %xmm4
805 ; SSE41-NEXT: paddb %xmm0, %xmm4
806 ; SSE41-NEXT: movdqa %xmm1, %xmm6
807 ; SSE41-NEXT: psrlw $4, %xmm6
808 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
809 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
810 ; SSE41-NEXT: movdqa %xmm1, %xmm6
811 ; SSE41-NEXT: psrlw $2, %xmm6
812 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
813 ; SSE41-NEXT: movdqa %xmm4, %xmm0
814 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
815 ; SSE41-NEXT: movdqa %xmm1, %xmm6
816 ; SSE41-NEXT: psrlw $1, %xmm6
817 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
818 ; SSE41-NEXT: paddb %xmm4, %xmm4
819 ; SSE41-NEXT: movdqa %xmm4, %xmm0
820 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
821 ; SSE41-NEXT: pandn %xmm5, %xmm3
822 ; SSE41-NEXT: psllw $5, %xmm3
823 ; SSE41-NEXT: movdqa %xmm3, %xmm4
824 ; SSE41-NEXT: paddb %xmm3, %xmm4
825 ; SSE41-NEXT: paddb %xmm2, %xmm2
826 ; SSE41-NEXT: movdqa %xmm2, %xmm5
827 ; SSE41-NEXT: psllw $4, %xmm5
828 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
829 ; SSE41-NEXT: movdqa %xmm3, %xmm0
830 ; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
831 ; SSE41-NEXT: movdqa %xmm2, %xmm3
832 ; SSE41-NEXT: psllw $2, %xmm3
833 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
834 ; SSE41-NEXT: movdqa %xmm4, %xmm0
835 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
836 ; SSE41-NEXT: movdqa %xmm2, %xmm3
837 ; SSE41-NEXT: paddb %xmm2, %xmm3
838 ; SSE41-NEXT: paddb %xmm4, %xmm4
839 ; SSE41-NEXT: movdqa %xmm4, %xmm0
840 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
841 ; SSE41-NEXT: por %xmm1, %xmm2
842 ; SSE41-NEXT: movdqa %xmm2, %xmm0
845 ; AVX1-LABEL: var_funnnel_v16i8:
847 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
848 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
849 ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
850 ; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
851 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
852 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
853 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
854 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
855 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
856 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
857 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
858 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
859 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
860 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
861 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
862 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
863 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
864 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
865 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
866 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
867 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
868 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
869 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
870 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
871 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
872 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
873 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
874 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
877 ; AVX2-LABEL: var_funnnel_v16i8:
879 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
880 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
881 ; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4
882 ; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
883 ; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
884 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
885 ; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
886 ; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4
887 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
888 ; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
889 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm4
890 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
891 ; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
892 ; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
893 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
894 ; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2
895 ; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
896 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
897 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4
898 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
899 ; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
900 ; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
901 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
902 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
903 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
904 ; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
905 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
906 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
909 ; AVX512F-LABEL: var_funnnel_v16i8:
911 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
912 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
913 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
914 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
915 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
916 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
917 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
918 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
919 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
920 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
921 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
922 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
923 ; AVX512F-NEXT: vzeroupper
926 ; AVX512VL-LABEL: var_funnnel_v16i8:
928 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
929 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
930 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
931 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
932 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
933 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
934 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
935 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
936 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
937 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
938 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
939 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
940 ; AVX512VL-NEXT: vzeroupper
941 ; AVX512VL-NEXT: retq
943 ; AVX512BW-LABEL: var_funnnel_v16i8:
945 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
946 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
947 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
948 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
949 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
950 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
951 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
952 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
953 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
954 ; AVX512BW-NEXT: vzeroupper
955 ; AVX512BW-NEXT: retq
957 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
958 ; AVX512VBMI2: # %bb.0:
959 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
960 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
961 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
962 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1
963 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
964 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
965 ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0
966 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
967 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
968 ; AVX512VBMI2-NEXT: vzeroupper
969 ; AVX512VBMI2-NEXT: retq
971 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
972 ; AVX512VLBW: # %bb.0:
973 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
974 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
975 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
976 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
977 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
978 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
979 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
980 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
981 ; AVX512VLBW-NEXT: vzeroupper
982 ; AVX512VLBW-NEXT: retq
984 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
985 ; AVX512VLVBMI2: # %bb.0:
986 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
987 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
988 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
989 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
990 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
991 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
992 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm0, %ymm3, %ymm0
993 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
994 ; AVX512VLVBMI2-NEXT: vzeroupper
995 ; AVX512VLVBMI2-NEXT: retq
997 ; XOPAVX1-LABEL: var_funnnel_v16i8:
999 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1000 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1001 ; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
1002 ; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
1003 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
1004 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1005 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1006 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1007 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1008 ; XOPAVX1-NEXT: retq
1010 ; XOPAVX2-LABEL: var_funnnel_v16i8:
1012 ; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1013 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1014 ; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
1015 ; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
1016 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
1017 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1018 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1019 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
1020 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1021 ; XOPAVX2-NEXT: retq
1023 ; X86-SSE2-LABEL: var_funnnel_v16i8:
1024 ; X86-SSE2: # %bb.0:
1025 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1026 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
1027 ; X86-SSE2-NEXT: pand %xmm5, %xmm6
1028 ; X86-SSE2-NEXT: psllw $5, %xmm6
1029 ; X86-SSE2-NEXT: pxor %xmm4, %xmm4
1030 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
1031 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3
1032 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
1033 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7
1034 ; X86-SSE2-NEXT: psrlw $4, %xmm1
1035 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1036 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1037 ; X86-SSE2-NEXT: por %xmm7, %xmm3
1038 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6
1039 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1040 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
1041 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
1042 ; X86-SSE2-NEXT: pandn %xmm3, %xmm7
1043 ; X86-SSE2-NEXT: psrlw $2, %xmm3
1044 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1045 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1046 ; X86-SSE2-NEXT: por %xmm7, %xmm3
1047 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6
1048 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1049 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
1050 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
1051 ; X86-SSE2-NEXT: pandn %xmm3, %xmm6
1052 ; X86-SSE2-NEXT: psrlw $1, %xmm3
1053 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1054 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1055 ; X86-SSE2-NEXT: por %xmm6, %xmm3
1056 ; X86-SSE2-NEXT: pandn %xmm5, %xmm2
1057 ; X86-SSE2-NEXT: psllw $5, %xmm2
1058 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1059 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
1060 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1061 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1062 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5
1063 ; X86-SSE2-NEXT: psllw $4, %xmm0
1064 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
1065 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1066 ; X86-SSE2-NEXT: por %xmm5, %xmm0
1067 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1068 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1069 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
1070 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1071 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5
1072 ; X86-SSE2-NEXT: psllw $2, %xmm0
1073 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
1074 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1075 ; X86-SSE2-NEXT: por %xmm5, %xmm0
1076 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1077 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
1078 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
1079 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1
1080 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1081 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
1082 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1083 ; X86-SSE2-NEXT: por %xmm3, %xmm0
1084 ; X86-SSE2-NEXT: retl
1085 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
1090 ; Uniform Variable Shifts
1093 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
1094 ; SSE-LABEL: splatvar_funnnel_v2i64:
1096 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
1097 ; SSE-NEXT: movdqa %xmm2, %xmm4
1098 ; SSE-NEXT: pand %xmm3, %xmm4
1099 ; SSE-NEXT: psrlq %xmm4, %xmm1
1100 ; SSE-NEXT: pandn %xmm3, %xmm2
1101 ; SSE-NEXT: paddq %xmm0, %xmm0
1102 ; SSE-NEXT: psllq %xmm2, %xmm0
1103 ; SSE-NEXT: por %xmm1, %xmm0
1106 ; AVX1-LABEL: splatvar_funnnel_v2i64:
1108 ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
1109 ; AVX1-NEXT: # xmm3 = mem[0,0]
1110 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1111 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1112 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1113 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1114 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1115 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1118 ; AVX2-LABEL: splatvar_funnnel_v2i64:
1120 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1121 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1122 ; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1123 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1124 ; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1125 ; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1126 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1129 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
1131 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1132 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1133 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1134 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1135 ; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1136 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1137 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1138 ; AVX512F-NEXT: retq
1140 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1141 ; AVX512VL: # %bb.0:
1142 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1143 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1144 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1145 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1146 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1147 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1148 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1149 ; AVX512VL-NEXT: retq
1151 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1152 ; AVX512BW: # %bb.0:
1153 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1154 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1155 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1156 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1157 ; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1158 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1159 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1160 ; AVX512BW-NEXT: retq
1162 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1163 ; AVX512VBMI2: # %bb.0:
1164 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1165 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1166 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1167 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1168 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1169 ; AVX512VBMI2-NEXT: vzeroupper
1170 ; AVX512VBMI2-NEXT: retq
1172 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1173 ; AVX512VLBW: # %bb.0:
1174 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1175 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1176 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1177 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1178 ; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1179 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1180 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1181 ; AVX512VLBW-NEXT: retq
1183 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1184 ; AVX512VLVBMI2: # %bb.0:
1185 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1186 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
1187 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1188 ; AVX512VLVBMI2-NEXT: retq
1190 ; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
1192 ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [63,63]
1193 ; XOPAVX1-NEXT: # xmm3 = mem[0,0]
1194 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1195 ; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1196 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
1197 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1198 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1199 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1200 ; XOPAVX1-NEXT: retq
1202 ; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
1204 ; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1205 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
1206 ; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1207 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
1208 ; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1209 ; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1210 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1211 ; XOPAVX2-NEXT: retq
1213 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
1214 ; X86-SSE2: # %bb.0:
1215 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
1216 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1217 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1218 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
1219 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1220 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0
1221 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
1222 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1223 ; X86-SSE2-NEXT: retl
1224 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1225 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
1229 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1230 ; SSE-LABEL: splatvar_funnnel_v4i32:
1232 ; SSE-NEXT: movdqa %xmm1, %xmm3
1233 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1234 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1235 ; SSE-NEXT: psrlq %xmm2, %xmm3
1236 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1237 ; SSE-NEXT: psrlq %xmm2, %xmm1
1238 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1239 ; SSE-NEXT: movaps %xmm1, %xmm0
1242 ; AVX-LABEL: splatvar_funnnel_v4i32:
1244 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1245 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1246 ; AVX-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1247 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1248 ; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1249 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1252 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
1254 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1255 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1256 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1257 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1258 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1259 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1260 ; AVX512F-NEXT: retq
1262 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1263 ; AVX512VL: # %bb.0:
1264 ; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1265 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1266 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1267 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1268 ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
1269 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
1270 ; AVX512VL-NEXT: vzeroupper
1271 ; AVX512VL-NEXT: retq
1273 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1274 ; AVX512BW: # %bb.0:
1275 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1276 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1277 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1278 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1279 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1280 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1281 ; AVX512BW-NEXT: retq
1283 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1284 ; AVX512VBMI2: # %bb.0:
1285 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1286 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1287 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1288 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1289 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1290 ; AVX512VBMI2-NEXT: vzeroupper
1291 ; AVX512VBMI2-NEXT: retq
1293 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1294 ; AVX512VLBW: # %bb.0:
1295 ; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1296 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1297 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1298 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1299 ; AVX512VLBW-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
1300 ; AVX512VLBW-NEXT: vpmovqd %ymm0, %xmm0
1301 ; AVX512VLBW-NEXT: vzeroupper
1302 ; AVX512VLBW-NEXT: retq
1304 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1305 ; AVX512VLVBMI2: # %bb.0:
1306 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1307 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
1308 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1309 ; AVX512VLVBMI2-NEXT: retq
1311 ; XOP-LABEL: splatvar_funnnel_v4i32:
1313 ; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1314 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1315 ; XOP-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1316 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1317 ; XOP-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1318 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1321 ; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1322 ; X86-SSE2: # %bb.0:
1323 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1324 ; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1325 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1326 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm3
1327 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1328 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm1
1329 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1330 ; X86-SSE2-NEXT: movaps %xmm1, %xmm0
1331 ; X86-SSE2-NEXT: retl
1332 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1333 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
  ret <4 x i32> %res
}
1337 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1338 ; SSE-LABEL: splatvar_funnnel_v8i16:
1340 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
1341 ; SSE-NEXT: movdqa %xmm2, %xmm4
1342 ; SSE-NEXT: pand %xmm3, %xmm4
1343 ; SSE-NEXT: psrlw %xmm4, %xmm1
1344 ; SSE-NEXT: pandn %xmm3, %xmm2
1345 ; SSE-NEXT: paddw %xmm0, %xmm0
1346 ; SSE-NEXT: psllw %xmm2, %xmm0
1347 ; SSE-NEXT: por %xmm1, %xmm0
1350 ; AVX-LABEL: splatvar_funnnel_v8i16:
1352 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1353 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
1354 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1355 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
1356 ; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1357 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1358 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1361 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
1363 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1364 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1365 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1366 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1367 ; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1368 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1369 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1370 ; AVX512F-NEXT: retq
1372 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1373 ; AVX512VL: # %bb.0:
1374 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1375 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1376 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1377 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1378 ; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1379 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1380 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1381 ; AVX512VL-NEXT: retq
1383 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1384 ; AVX512BW: # %bb.0:
1385 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1386 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1387 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1388 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1389 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1390 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1391 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1392 ; AVX512BW-NEXT: retq
1394 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1395 ; AVX512VBMI2: # %bb.0:
1396 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1397 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1398 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1399 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
1400 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1401 ; AVX512VBMI2-NEXT: vzeroupper
1402 ; AVX512VBMI2-NEXT: retq
1404 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1405 ; AVX512VLBW: # %bb.0:
1406 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1407 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1408 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1409 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1410 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1411 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1412 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1413 ; AVX512VLBW-NEXT: retq
1415 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1416 ; AVX512VLVBMI2: # %bb.0:
1417 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1418 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
1419 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1420 ; AVX512VLVBMI2-NEXT: retq
1422 ; XOP-LABEL: splatvar_funnnel_v8i16:
1424 ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0]
1425 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
1426 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1427 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
1428 ; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1429 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1430 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1433 ; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1434 ; X86-SSE2: # %bb.0:
1435 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0]
1436 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1437 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1438 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
1439 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1440 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
1441 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0
1442 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1443 ; X86-SSE2-NEXT: retl
1444 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
  ret <8 x i16> %res
}
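; Note (editorial): x86 has no per-byte shift instructions, so the v16i8 splat-amount
; test below expects the inputs to be interleaved into 16-bit lanes
; (punpcklbw/punpckhbw), shifted as words, masked and packed back with packuswb
; (or truncated via vpmovwb on AVX512VL+BW targets).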
1449 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1450 ; SSE-LABEL: splatvar_funnnel_v16i8:
1452 ; SSE-NEXT: movdqa %xmm1, %xmm4
1453 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1454 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1455 ; SSE-NEXT: psrlw %xmm2, %xmm4
1456 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1457 ; SSE-NEXT: pand %xmm3, %xmm4
1458 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1459 ; SSE-NEXT: psrlw %xmm2, %xmm1
1460 ; SSE-NEXT: pand %xmm1, %xmm3
1461 ; SSE-NEXT: packuswb %xmm4, %xmm3
1462 ; SSE-NEXT: movdqa %xmm3, %xmm0
1465 ; AVX1-LABEL: splatvar_funnnel_v16i8:
1467 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1468 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1469 ; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1470 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1471 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1472 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1473 ; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1474 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1475 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1478 ; AVX2-LABEL: splatvar_funnnel_v16i8:
1480 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1481 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1482 ; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1483 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1484 ; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
1485 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1486 ; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1487 ; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
1488 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1491 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
1493 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1494 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1495 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1496 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1497 ; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3
1498 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1499 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1500 ; AVX512F-NEXT: vpand %xmm4, %xmm0, %xmm0
1501 ; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1502 ; AVX512F-NEXT: retq
1504 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1505 ; AVX512VL: # %bb.0:
1506 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1507 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1508 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1509 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1510 ; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3
1511 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1512 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1513 ; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0
1514 ; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1515 ; AVX512VL-NEXT: retq
1517 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1518 ; AVX512BW: # %bb.0:
1519 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1520 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1521 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1522 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1523 ; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3
1524 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1525 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1526 ; AVX512BW-NEXT: vpand %xmm4, %xmm0, %xmm0
1527 ; AVX512BW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1528 ; AVX512BW-NEXT: retq
1530 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1531 ; AVX512VBMI2: # %bb.0:
1532 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78]
1533 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1534 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1535 ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm4, %xmm4
1536 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1537 ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1538 ; AVX512VBMI2-NEXT: vpermt2b %zmm4, %zmm3, %zmm0
1539 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1540 ; AVX512VBMI2-NEXT: vzeroupper
1541 ; AVX512VBMI2-NEXT: retq
1543 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1544 ; AVX512VLBW: # %bb.0:
1545 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1546 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1547 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1548 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1549 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1550 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1551 ; AVX512VLBW-NEXT: vzeroupper
1552 ; AVX512VLBW-NEXT: retq
1554 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1555 ; AVX512VLVBMI2: # %bb.0:
1556 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1557 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1558 ; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1559 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1560 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1561 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1562 ; AVX512VLVBMI2-NEXT: vzeroupper
1563 ; AVX512VLVBMI2-NEXT: retq
1565 ; XOP-LABEL: splatvar_funnnel_v16i8:
1567 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1568 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1569 ; XOP-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1570 ; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1571 ; XOP-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1572 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14]
1575 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1576 ; X86-SSE2: # %bb.0:
1577 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
1578 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1579 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1580 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm4
1581 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1582 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1583 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1584 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm1
1585 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1586 ; X86-SSE2-NEXT: packuswb %xmm4, %xmm3
1587 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
1588 ; X86-SSE2-NEXT: retl
1589 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
  ret <16 x i8> %res
}

;
; Constant Shifts
;
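; Note (editorial): these tests use a different constant amount in each lane. The
; 64/32-bit cases expect per-element immediate shifts plus blends on SSE, variable
; count shifts (vpsrlv*/vpsllv*) on AVX2 and later, and vpshl* on XOP; the 16/8-bit
; cases go through multiplies on SSE/AVX. For 64/32/16-bit elements, VBMI2 targets
; fold the whole funnel shift into vpshrdv* fed by a constant amount vector.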
1598 define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1599 ; SSE2-LABEL: constant_funnnel_v2i64:
1601 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1602 ; SSE2-NEXT: psrlq $4, %xmm2
1603 ; SSE2-NEXT: psrlq $14, %xmm1
1604 ; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1605 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1606 ; SSE2-NEXT: psllq $60, %xmm1
1607 ; SSE2-NEXT: psllq $50, %xmm0
1608 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1609 ; SSE2-NEXT: orpd %xmm2, %xmm0
1612 ; SSE41-LABEL: constant_funnnel_v2i64:
1614 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1615 ; SSE41-NEXT: psrlq $14, %xmm2
1616 ; SSE41-NEXT: psrlq $4, %xmm1
1617 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1618 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1619 ; SSE41-NEXT: psllq $50, %xmm1
1620 ; SSE41-NEXT: psllq $60, %xmm0
1621 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1622 ; SSE41-NEXT: por %xmm2, %xmm0
1625 ; AVX1-LABEL: constant_funnnel_v2i64:
1627 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
1628 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
1629 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1630 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
1631 ; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
1632 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1633 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1636 ; AVX2-LABEL: constant_funnnel_v2i64:
1638 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1639 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1640 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1643 ; AVX512F-LABEL: constant_funnnel_v2i64:
1645 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1646 ; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1647 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1648 ; AVX512F-NEXT: retq
1650 ; AVX512VL-LABEL: constant_funnnel_v2i64:
1651 ; AVX512VL: # %bb.0:
1652 ; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1653 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1654 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1655 ; AVX512VL-NEXT: retq
1657 ; AVX512BW-LABEL: constant_funnnel_v2i64:
1658 ; AVX512BW: # %bb.0:
1659 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1660 ; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1661 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1662 ; AVX512BW-NEXT: retq
1664 ; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1665 ; AVX512VBMI2: # %bb.0:
1666 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1667 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1668 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,14]
1669 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1670 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1671 ; AVX512VBMI2-NEXT: vzeroupper
1672 ; AVX512VBMI2-NEXT: retq
1674 ; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1675 ; AVX512VLBW: # %bb.0:
1676 ; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1677 ; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1678 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1679 ; AVX512VLBW-NEXT: retq
1681 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1682 ; AVX512VLVBMI2: # %bb.0:
1683 ; AVX512VLVBMI2-NEXT: vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1684 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1685 ; AVX512VLVBMI2-NEXT: retq
1687 ; XOPAVX1-LABEL: constant_funnnel_v2i64:
1689 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1690 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1691 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1692 ; XOPAVX1-NEXT: retq
1694 ; XOPAVX2-LABEL: constant_funnnel_v2i64:
1696 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1697 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1698 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1699 ; XOPAVX2-NEXT: retq
1701 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
1702 ; X86-SSE2: # %bb.0:
1703 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1704 ; X86-SSE2-NEXT: psrlq $4, %xmm2
1705 ; X86-SSE2-NEXT: psrlq $14, %xmm1
1706 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1707 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1708 ; X86-SSE2-NEXT: psllq $60, %xmm1
1709 ; X86-SSE2-NEXT: psllq $50, %xmm0
1710 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1711 ; X86-SSE2-NEXT: orpd %xmm2, %xmm0
1712 ; X86-SSE2-NEXT: retl
1713 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
  ret <2 x i64> %res
}
1717 define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
1718 ; SSE2-LABEL: constant_funnnel_v4i32:
1720 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1721 ; SSE2-NEXT: psrld $7, %xmm2
1722 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1723 ; SSE2-NEXT: psrld $6, %xmm3
1724 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1725 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1726 ; SSE2-NEXT: psrld $5, %xmm2
1727 ; SSE2-NEXT: psrld $4, %xmm1
1728 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1729 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1730 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1731 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1732 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1733 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1734 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1735 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1736 ; SSE2-NEXT: por %xmm1, %xmm0
1739 ; SSE41-LABEL: constant_funnnel_v4i32:
1741 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1742 ; SSE41-NEXT: psrld $7, %xmm2
1743 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1744 ; SSE41-NEXT: psrld $5, %xmm3
1745 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1746 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1747 ; SSE41-NEXT: psrld $6, %xmm2
1748 ; SSE41-NEXT: psrld $4, %xmm1
1749 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1750 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1751 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1752 ; SSE41-NEXT: por %xmm2, %xmm0
1755 ; AVX1-LABEL: constant_funnnel_v4i32:
1757 ; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
1758 ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
1759 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1760 ; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
1761 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
1762 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1763 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1764 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1765 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1768 ; AVX2-LABEL: constant_funnnel_v4i32:
1770 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1771 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1772 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1775 ; AVX512F-LABEL: constant_funnnel_v4i32:
1777 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1778 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1779 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1780 ; AVX512F-NEXT: retq
1782 ; AVX512VL-LABEL: constant_funnnel_v4i32:
1783 ; AVX512VL: # %bb.0:
1784 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1785 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1786 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1787 ; AVX512VL-NEXT: retq
1789 ; AVX512BW-LABEL: constant_funnnel_v4i32:
1790 ; AVX512BW: # %bb.0:
1791 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1792 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1793 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1794 ; AVX512BW-NEXT: retq
1796 ; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1797 ; AVX512VBMI2: # %bb.0:
1798 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1799 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1800 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7]
1801 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1802 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1803 ; AVX512VBMI2-NEXT: vzeroupper
1804 ; AVX512VBMI2-NEXT: retq
1806 ; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1807 ; AVX512VLBW: # %bb.0:
1808 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1809 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1810 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1811 ; AVX512VLBW-NEXT: retq
1813 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1814 ; AVX512VLVBMI2: # %bb.0:
1815 ; AVX512VLVBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1816 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1817 ; AVX512VLVBMI2-NEXT: retq
1819 ; XOPAVX1-LABEL: constant_funnnel_v4i32:
1821 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1822 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1823 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1824 ; XOPAVX1-NEXT: retq
1826 ; XOPAVX2-LABEL: constant_funnnel_v4i32:
1828 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1829 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1830 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1831 ; XOPAVX2-NEXT: retq
1833 ; X86-SSE2-LABEL: constant_funnnel_v4i32:
1834 ; X86-SSE2: # %bb.0:
1835 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1836 ; X86-SSE2-NEXT: psrld $7, %xmm2
1837 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1838 ; X86-SSE2-NEXT: psrld $6, %xmm3
1839 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1840 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1841 ; X86-SSE2-NEXT: psrld $5, %xmm2
1842 ; X86-SSE2-NEXT: psrld $4, %xmm1
1843 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1844 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1845 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1846 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1847 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1848 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1849 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1850 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1851 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1852 ; X86-SSE2-NEXT: retl
1853 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
  ret <4 x i32> %res
}
1857 define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
1858 ; SSE2-LABEL: constant_funnnel_v8i16:
1860 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1861 ; SSE2-NEXT: pandn %xmm1, %xmm2
1862 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1863 ; SSE2-NEXT: por %xmm1, %xmm2
1864 ; SSE2-NEXT: paddw %xmm0, %xmm0
1865 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1866 ; SSE2-NEXT: por %xmm2, %xmm0
1869 ; SSE41-LABEL: constant_funnnel_v8i16:
1871 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,32768,16384,8192,4096,2048,1024,512]
1872 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2
1873 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1874 ; SSE41-NEXT: paddw %xmm0, %xmm0
1875 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1876 ; SSE41-NEXT: por %xmm2, %xmm0
1879 ; AVX-LABEL: constant_funnnel_v8i16:
1881 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1882 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1883 ; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1884 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1885 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1888 ; AVX512F-LABEL: constant_funnnel_v8i16:
1890 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1891 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1892 ; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1893 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1894 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1895 ; AVX512F-NEXT: retq
1897 ; AVX512VL-LABEL: constant_funnnel_v8i16:
1898 ; AVX512VL: # %bb.0:
1899 ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1900 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1901 ; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1902 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1903 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1904 ; AVX512VL-NEXT: retq
1906 ; AVX512BW-LABEL: constant_funnnel_v8i16:
1907 ; AVX512BW: # %bb.0:
1908 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1909 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1910 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
1911 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
1912 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1913 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1914 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1915 ; AVX512BW-NEXT: vzeroupper
1916 ; AVX512BW-NEXT: retq
1918 ; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
1919 ; AVX512VBMI2: # %bb.0:
1920 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1921 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1922 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1923 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
1924 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1925 ; AVX512VBMI2-NEXT: vzeroupper
1926 ; AVX512VBMI2-NEXT: retq
1928 ; AVX512VLBW-LABEL: constant_funnnel_v8i16:
1929 ; AVX512VLBW: # %bb.0:
1930 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1931 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1932 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1933 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1934 ; AVX512VLBW-NEXT: retq
1936 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
1937 ; AVX512VLVBMI2: # %bb.0:
1938 ; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1939 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1940 ; AVX512VLVBMI2-NEXT: retq
1942 ; XOP-LABEL: constant_funnnel_v8i16:
1944 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1945 ; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1946 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1947 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1950 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
1951 ; X86-SSE2: # %bb.0:
1952 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1953 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
1954 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1955 ; X86-SSE2-NEXT: por %xmm1, %xmm2
1956 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
1957 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1958 ; X86-SSE2-NEXT: por %xmm2, %xmm0
1959 ; X86-SSE2-NEXT: retl
1960 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
  ret <8 x i16> %res
}
1964 define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
1965 ; SSE2-LABEL: constant_funnnel_v16i8:
1967 ; SSE2-NEXT: pxor %xmm2, %xmm2
1968 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1969 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1970 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1971 ; SSE2-NEXT: psrlw $8, %xmm3
1972 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1973 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1974 ; SSE2-NEXT: psrlw $8, %xmm1
1975 ; SSE2-NEXT: packuswb %xmm3, %xmm1
1976 ; SSE2-NEXT: paddb %xmm0, %xmm0
1977 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1978 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1979 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1980 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1981 ; SSE2-NEXT: pand %xmm3, %xmm2
1982 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1983 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1984 ; SSE2-NEXT: pand %xmm3, %xmm0
1985 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1986 ; SSE2-NEXT: por %xmm1, %xmm0
1989 ; SSE41-LABEL: constant_funnnel_v16i8:
1991 ; SSE41-NEXT: paddb %xmm0, %xmm0
1992 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1993 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1994 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1995 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1996 ; SSE41-NEXT: pand %xmm3, %xmm0
1997 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1998 ; SSE41-NEXT: pand %xmm3, %xmm2
1999 ; SSE41-NEXT: packuswb %xmm0, %xmm2
2000 ; SSE41-NEXT: pxor %xmm3, %xmm3
2001 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2002 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
2003 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2004 ; SSE41-NEXT: psrlw $8, %xmm1
2005 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2006 ; SSE41-NEXT: psrlw $8, %xmm0
2007 ; SSE41-NEXT: packuswb %xmm1, %xmm0
2008 ; SSE41-NEXT: por %xmm2, %xmm0
2011 ; AVX1-LABEL: constant_funnnel_v16i8:
2013 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2014 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2015 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2016 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2017 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2018 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2019 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2020 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2021 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2022 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2023 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2024 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
2025 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2026 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2027 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2028 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2029 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2030 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2033 ; AVX2-LABEL: constant_funnnel_v16i8:
2035 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2036 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2037 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2038 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2039 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2040 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2041 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2042 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2043 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2044 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2045 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2046 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2047 ; AVX2-NEXT: vzeroupper
2050 ; AVX512F-LABEL: constant_funnnel_v16i8:
2052 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2053 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2054 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2055 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2056 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2057 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
2058 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2059 ; AVX512F-NEXT: vzeroupper
2060 ; AVX512F-NEXT: retq
2062 ; AVX512VL-LABEL: constant_funnnel_v16i8:
2063 ; AVX512VL: # %bb.0:
2064 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2065 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2066 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2067 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2068 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2069 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
2070 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2071 ; AVX512VL-NEXT: vzeroupper
2072 ; AVX512VL-NEXT: retq
2074 ; AVX512BW-LABEL: constant_funnnel_v16i8:
2075 ; AVX512BW: # %bb.0:
2076 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2077 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2078 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2079 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
2080 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2081 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
2082 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2083 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2084 ; AVX512BW-NEXT: vzeroupper
2085 ; AVX512BW-NEXT: retq
2087 ; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2088 ; AVX512VBMI2: # %bb.0:
2089 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2090 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2091 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
2092 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1
2093 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2094 ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0
2095 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2096 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2097 ; AVX512VBMI2-NEXT: vzeroupper
2098 ; AVX512VBMI2-NEXT: retq
2100 ; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2101 ; AVX512VLBW: # %bb.0:
2102 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2103 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2104 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
2105 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2106 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2107 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2108 ; AVX512VLBW-NEXT: vzeroupper
2109 ; AVX512VLBW-NEXT: retq
2111 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2112 ; AVX512VLVBMI2: # %bb.0:
2113 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2114 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2115 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
2116 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
2117 ; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2118 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2119 ; AVX512VLVBMI2-NEXT: vzeroupper
2120 ; AVX512VLVBMI2-NEXT: retq
2122 ; XOP-LABEL: constant_funnnel_v16i8:
2124 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2125 ; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2126 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2127 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2130 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
2131 ; X86-SSE2: # %bb.0:
2132 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
2133 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
2134 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2135 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
2136 ; X86-SSE2-NEXT: psrlw $8, %xmm3
2137 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2138 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2139 ; X86-SSE2-NEXT: psrlw $8, %xmm1
2140 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
2141 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
2142 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
2143 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2144 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
2145 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2146 ; X86-SSE2-NEXT: pand %xmm3, %xmm2
2147 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2148 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2149 ; X86-SSE2-NEXT: pand %xmm3, %xmm0
2150 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
2151 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2152 ; X86-SSE2-NEXT: retl
2153 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <16 x i8> %res
}
;
; Uniform Constant Shifts
;
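; Note (editorial): with the same constant amount in every lane, the funnel shift
; folds to a pair of immediate shifts plus an OR (e.g. psrlq $14 / psllq $50 for
; v2i64), and VBMI2 targets expect the single-instruction immediate forms
; vpshrdq/vpshrdd/vpshrdw.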
2161 define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2162 ; SSE-LABEL: splatconstant_funnnel_v2i64:
2164 ; SSE-NEXT: psrlq $14, %xmm1
2165 ; SSE-NEXT: psllq $50, %xmm0
2166 ; SSE-NEXT: por %xmm1, %xmm0
2169 ; AVX-LABEL: splatconstant_funnnel_v2i64:
2171 ; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
2172 ; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
2173 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2176 ; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2178 ; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1
2179 ; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0
2180 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2181 ; AVX512F-NEXT: retq
2183 ; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2184 ; AVX512VL: # %bb.0:
2185 ; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1
2186 ; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0
2187 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2188 ; AVX512VL-NEXT: retq
2190 ; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2191 ; AVX512BW: # %bb.0:
2192 ; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1
2193 ; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0
2194 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2195 ; AVX512BW-NEXT: retq
2197 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2198 ; AVX512VBMI2: # %bb.0:
2199 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2200 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2201 ; AVX512VBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0
2202 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2203 ; AVX512VBMI2-NEXT: vzeroupper
2204 ; AVX512VBMI2-NEXT: retq
2206 ; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2207 ; AVX512VLBW: # %bb.0:
2208 ; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1
2209 ; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0
2210 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2211 ; AVX512VLBW-NEXT: retq
2213 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2214 ; AVX512VLVBMI2: # %bb.0:
2215 ; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0
2216 ; AVX512VLVBMI2-NEXT: retq
2218 ; XOP-LABEL: splatconstant_funnnel_v2i64:
2220 ; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
2221 ; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
2222 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2225 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
2226 ; X86-SSE2: # %bb.0:
2227 ; X86-SSE2-NEXT: psrlq $14, %xmm1
2228 ; X86-SSE2-NEXT: psllq $50, %xmm0
2229 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2230 ; X86-SSE2-NEXT: retl
2231 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
  ret <2 x i64> %res
}
2235 define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2236 ; SSE-LABEL: splatconstant_funnnel_v4i32:
2238 ; SSE-NEXT: psrld $4, %xmm1
2239 ; SSE-NEXT: pslld $28, %xmm0
2240 ; SSE-NEXT: por %xmm1, %xmm0
2243 ; AVX-LABEL: splatconstant_funnnel_v4i32:
2245 ; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
2246 ; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2247 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2250 ; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2252 ; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1
2253 ; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0
2254 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2255 ; AVX512F-NEXT: retq
2257 ; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2258 ; AVX512VL: # %bb.0:
2259 ; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
2260 ; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0
2261 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2262 ; AVX512VL-NEXT: retq
2264 ; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2265 ; AVX512BW: # %bb.0:
2266 ; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1
2267 ; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0
2268 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2269 ; AVX512BW-NEXT: retq
2271 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2272 ; AVX512VBMI2: # %bb.0:
2273 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2274 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2275 ; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0
2276 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2277 ; AVX512VBMI2-NEXT: vzeroupper
2278 ; AVX512VBMI2-NEXT: retq
2280 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2281 ; AVX512VLBW: # %bb.0:
2282 ; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1
2283 ; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0
2284 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2285 ; AVX512VLBW-NEXT: retq
2287 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2288 ; AVX512VLVBMI2: # %bb.0:
2289 ; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
2290 ; AVX512VLVBMI2-NEXT: retq
2292 ; XOP-LABEL: splatconstant_funnnel_v4i32:
2294 ; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
2295 ; XOP-NEXT: vpslld $28, %xmm0, %xmm0
2296 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2299 ; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
2300 ; X86-SSE2: # %bb.0:
2301 ; X86-SSE2-NEXT: psrld $4, %xmm1
2302 ; X86-SSE2-NEXT: pslld $28, %xmm0
2303 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2304 ; X86-SSE2-NEXT: retl
2305 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
  ret <4 x i32> %res
}
2309 define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2310 ; SSE-LABEL: splatconstant_funnnel_v8i16:
2312 ; SSE-NEXT: psrlw $7, %xmm1
2313 ; SSE-NEXT: psllw $9, %xmm0
2314 ; SSE-NEXT: por %xmm1, %xmm0
2317 ; AVX-LABEL: splatconstant_funnnel_v8i16:
2319 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
2320 ; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2321 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2324 ; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2326 ; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1
2327 ; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
2328 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2329 ; AVX512F-NEXT: retq
2331 ; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2332 ; AVX512VL: # %bb.0:
2333 ; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1
2334 ; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
2335 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2336 ; AVX512VL-NEXT: retq
2338 ; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2339 ; AVX512BW: # %bb.0:
2340 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
2341 ; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
2342 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2343 ; AVX512BW-NEXT: retq
2345 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2346 ; AVX512VBMI2: # %bb.0:
2347 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2348 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2349 ; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0
2350 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2351 ; AVX512VBMI2-NEXT: vzeroupper
2352 ; AVX512VBMI2-NEXT: retq
2354 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2355 ; AVX512VLBW: # %bb.0:
2356 ; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1
2357 ; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
2358 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2359 ; AVX512VLBW-NEXT: retq
2361 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2362 ; AVX512VLVBMI2: # %bb.0:
2363 ; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0
2364 ; AVX512VLVBMI2-NEXT: retq
2366 ; XOP-LABEL: splatconstant_funnnel_v8i16:
2368 ; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
2369 ; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
2370 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2373 ; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
2374 ; X86-SSE2: # %bb.0:
2375 ; X86-SSE2-NEXT: psrlw $7, %xmm1
2376 ; X86-SSE2-NEXT: psllw $9, %xmm0
2377 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2378 ; X86-SSE2-NEXT: retl
2379 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <8 x i16> %res
}
2383 define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2384 ; SSE-LABEL: splatconstant_funnnel_v16i8:
2386 ; SSE-NEXT: psrlw $4, %xmm1
2387 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2388 ; SSE-NEXT: psllw $4, %xmm0
2389 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2390 ; SSE-NEXT: por %xmm1, %xmm0
2393 ; AVX-LABEL: splatconstant_funnnel_v16i8:
2395 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
2396 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2397 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2398 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2399 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2402 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
2404 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2
2405 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0
2406 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2407 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2408 ; AVX512F-NEXT: vzeroupper
2409 ; AVX512F-NEXT: retq
2411 ; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
2412 ; AVX512VL: # %bb.0:
2413 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
2414 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
2415 ; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
2416 ; AVX512VL-NEXT: retq
2418 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
2419 ; AVX512BW: # %bb.0:
2420 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2
2421 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0
2422 ; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2423 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2424 ; AVX512BW-NEXT: vzeroupper
2425 ; AVX512BW-NEXT: retq
2427 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
2428 ; AVX512VBMI2: # %bb.0:
2429 ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2430 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2431 ; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2432 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2433 ; AVX512VBMI2-NEXT: vzeroupper
2434 ; AVX512VBMI2-NEXT: retq
2436 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
2437 ; AVX512VLBW: # %bb.0:
2438 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
2439 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
2440 ; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
2441 ; AVX512VLBW-NEXT: retq
2443 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
2444 ; AVX512VLVBMI2: # %bb.0:
2445 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2446 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2447 ; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
2448 ; AVX512VLVBMI2-NEXT: retq
2450 ; XOP-LABEL: splatconstant_funnnel_v16i8:
2452 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2453 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2454 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2457 ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
2458 ; X86-SSE2: # %bb.0:
2459 ; X86-SSE2-NEXT: psrlw $4, %xmm1
2460 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2461 ; X86-SSE2-NEXT: psllw $4, %xmm0
2462 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2463 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2464 ; X86-SSE2-NEXT: retl
2465 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <16 x i8> %res
}