1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
14 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
15 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
17 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
18 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
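; To regenerate the CHECK lines after a codegen change, re-run
; utils/update_llc_test_checks.py over this file (pointed at a locally built
; llc); the assertion blocks below are not meant to be edited by hand.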
20 declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
21 declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
22 declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
23 declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
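; A brief reminder of the semantics being tested (per the LangRef): fshr(x, y,
; amt) concatenates x (high half) and y (low half) into a 2*N-bit value,
; shifts it right by amt modulo N, and returns the low N bits. For example,
; fshr.i8(0xFF, 0x00, 15) = 0xFE, since 0xFF00 >> (15 mod 8) = 0x01FE.
; Targets without a native funnel shift (VPSHRDV* needs AVX512VBMI2) mostly
; expand it below as (y lshr (amt & (N-1))) | ((x shl 1) shl (~amt & (N-1))),
; which gives the right result without special-casing amt == 0.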
29 define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
30 ; SSE2-LABEL: var_funnnel_v2i64:
32 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
33 ; SSE2-NEXT: movdqa %xmm2, %xmm4
34 ; SSE2-NEXT: pand %xmm3, %xmm4
35 ; SSE2-NEXT: movdqa %xmm1, %xmm5
36 ; SSE2-NEXT: psrlq %xmm4, %xmm5
37 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
38 ; SSE2-NEXT: psrlq %xmm4, %xmm1
39 ; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
40 ; SSE2-NEXT: pandn %xmm3, %xmm2
41 ; SSE2-NEXT: paddq %xmm0, %xmm0
42 ; SSE2-NEXT: movdqa %xmm0, %xmm1
43 ; SSE2-NEXT: psllq %xmm2, %xmm1
44 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
45 ; SSE2-NEXT: psllq %xmm2, %xmm0
46 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
47 ; SSE2-NEXT: orpd %xmm5, %xmm0
50 ; SSE41-LABEL: var_funnnel_v2i64:
52 ; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63]
53 ; SSE41-NEXT: movdqa %xmm2, %xmm4
54 ; SSE41-NEXT: pand %xmm3, %xmm4
55 ; SSE41-NEXT: movdqa %xmm1, %xmm5
56 ; SSE41-NEXT: psrlq %xmm4, %xmm5
57 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
58 ; SSE41-NEXT: psrlq %xmm4, %xmm1
59 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
60 ; SSE41-NEXT: pandn %xmm3, %xmm2
61 ; SSE41-NEXT: paddq %xmm0, %xmm0
62 ; SSE41-NEXT: movdqa %xmm0, %xmm1
63 ; SSE41-NEXT: psllq %xmm2, %xmm1
64 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
65 ; SSE41-NEXT: psllq %xmm2, %xmm0
66 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
67 ; SSE41-NEXT: por %xmm5, %xmm0
70 ; AVX1-LABEL: var_funnnel_v2i64:
72 ; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
73 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
74 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
75 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
76 ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
77 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
78 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
79 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
80 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
81 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
82 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
83 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
84 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
87 ; AVX2-LABEL: var_funnnel_v2i64:
89 ; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
90 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
91 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
92 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
93 ; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
94 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
95 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
98 ; AVX512F-LABEL: var_funnnel_v2i64:
100 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
101 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
102 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
103 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
104 ; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
105 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
106 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
109 ; AVX512VL-LABEL: var_funnnel_v2i64:
111 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
112 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
113 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
114 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
115 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
116 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
117 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
118 ; AVX512VL-NEXT: retq
120 ; AVX512BW-LABEL: var_funnnel_v2i64:
122 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
123 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
124 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
125 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
126 ; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
127 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
128 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
129 ; AVX512BW-NEXT: retq
131 ; AVX512VBMI2-LABEL: var_funnnel_v2i64:
132 ; AVX512VBMI2: # %bb.0:
133 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
134 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
135 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
136 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
137 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
138 ; AVX512VBMI2-NEXT: vzeroupper
139 ; AVX512VBMI2-NEXT: retq
141 ; AVX512VLBW-LABEL: var_funnnel_v2i64:
142 ; AVX512VLBW: # %bb.0:
143 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
144 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
145 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
146 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
147 ; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
148 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
149 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
150 ; AVX512VLBW-NEXT: retq
152 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
153 ; AVX512VLVBMI2: # %bb.0:
154 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
155 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
156 ; AVX512VLVBMI2-NEXT: retq
158 ; XOPAVX1-LABEL: var_funnnel_v2i64:
160 ; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
161 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
162 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
163 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
164 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
165 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
166 ; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
167 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
168 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
171 ; XOPAVX2-LABEL: var_funnnel_v2i64:
173 ; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
174 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
175 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
176 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
177 ; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
178 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
179 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
182 ; X86-SSE2-LABEL: var_funnnel_v2i64:
184 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
185 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
186 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
187 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
188 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm3
189 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
190 ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
191 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
192 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
193 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0
194 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
195 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1
196 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
197 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
198 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
199 ; X86-SSE2-NEXT: orpd %xmm3, %xmm0
200 ; X86-SSE2-NEXT: retl
201 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
205 define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
206 ; SSE2-LABEL: var_funnnel_v4i32:
208 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
209 ; SSE2-NEXT: movdqa %xmm2, %xmm5
210 ; SSE2-NEXT: pand %xmm4, %xmm5
211 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
212 ; SSE2-NEXT: movdqa %xmm1, %xmm6
213 ; SSE2-NEXT: psrld %xmm3, %xmm6
214 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
215 ; SSE2-NEXT: movdqa %xmm1, %xmm3
216 ; SSE2-NEXT: psrld %xmm7, %xmm3
217 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
218 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
219 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
220 ; SSE2-NEXT: movdqa %xmm1, %xmm7
221 ; SSE2-NEXT: psrld %xmm6, %xmm7
222 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
223 ; SSE2-NEXT: psrld %xmm5, %xmm1
224 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
225 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
226 ; SSE2-NEXT: pandn %xmm4, %xmm2
227 ; SSE2-NEXT: pslld $23, %xmm2
228 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
229 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
230 ; SSE2-NEXT: paddd %xmm0, %xmm0
231 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
232 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
233 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
234 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
235 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
236 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
237 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
238 ; SSE2-NEXT: por %xmm3, %xmm0
241 ; SSE41-LABEL: var_funnnel_v4i32:
243 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
244 ; SSE41-NEXT: movdqa %xmm2, %xmm4
245 ; SSE41-NEXT: pand %xmm3, %xmm4
246 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
247 ; SSE41-NEXT: movdqa %xmm1, %xmm6
248 ; SSE41-NEXT: psrld %xmm5, %xmm6
249 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
250 ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
251 ; SSE41-NEXT: movdqa %xmm1, %xmm8
252 ; SSE41-NEXT: psrld %xmm7, %xmm8
253 ; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
254 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
255 ; SSE41-NEXT: movdqa %xmm1, %xmm6
256 ; SSE41-NEXT: psrld %xmm4, %xmm6
257 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
258 ; SSE41-NEXT: psrld %xmm4, %xmm1
259 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
260 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
261 ; SSE41-NEXT: pandn %xmm3, %xmm2
262 ; SSE41-NEXT: pslld $23, %xmm2
263 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
264 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
265 ; SSE41-NEXT: paddd %xmm0, %xmm0
266 ; SSE41-NEXT: pmulld %xmm1, %xmm0
267 ; SSE41-NEXT: por %xmm6, %xmm0
270 ; AVX1-LABEL: var_funnnel_v4i32:
272 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
273 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
274 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
275 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
276 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
277 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
278 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
279 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
280 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
281 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
282 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
283 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
284 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
285 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
286 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
287 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
288 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
289 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
290 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
291 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
292 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
295 ; AVX2-LABEL: var_funnnel_v4i32:
297 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
298 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
299 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
300 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
301 ; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
302 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
303 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
306 ; AVX512F-LABEL: var_funnnel_v4i32:
308 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
309 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
310 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
311 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
312 ; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0
313 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
314 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
317 ; AVX512VL-LABEL: var_funnnel_v4i32:
319 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
320 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
321 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
322 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
323 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0
324 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
325 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
326 ; AVX512VL-NEXT: retq
328 ; AVX512BW-LABEL: var_funnnel_v4i32:
330 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
331 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
332 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
333 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
334 ; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
335 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
336 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
337 ; AVX512BW-NEXT: retq
339 ; AVX512VBMI2-LABEL: var_funnnel_v4i32:
340 ; AVX512VBMI2: # %bb.0:
341 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
342 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
343 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
344 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
345 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
346 ; AVX512VBMI2-NEXT: vzeroupper
347 ; AVX512VBMI2-NEXT: retq
349 ; AVX512VLBW-LABEL: var_funnnel_v4i32:
350 ; AVX512VLBW: # %bb.0:
351 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
352 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
353 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
354 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
355 ; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
356 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
357 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
358 ; AVX512VLBW-NEXT: retq
360 ; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
361 ; AVX512VLVBMI2: # %bb.0:
362 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
363 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
364 ; AVX512VLVBMI2-NEXT: retq
366 ; XOPAVX1-LABEL: var_funnnel_v4i32:
368 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
369 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
370 ; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
371 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
372 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
373 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
374 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
375 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
376 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
379 ; XOPAVX2-LABEL: var_funnnel_v4i32:
381 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
382 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
383 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
384 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
385 ; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
386 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
387 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
390 ; X86-SSE2-LABEL: var_funnnel_v4i32:
392 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
393 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
394 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
395 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
396 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
397 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6
398 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
399 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
400 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3
401 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
402 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
403 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
404 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
405 ; X86-SSE2-NEXT: psrld %xmm6, %xmm7
406 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
407 ; X86-SSE2-NEXT: psrld %xmm5, %xmm1
408 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
409 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
410 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
411 ; X86-SSE2-NEXT: pslld $23, %xmm2
412 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
413 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
414 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0
415 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
416 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
417 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
418 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
419 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
420 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
421 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
422 ; X86-SSE2-NEXT: por %xmm3, %xmm0
423 ; X86-SSE2-NEXT: retl
424 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
428 define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
429 ; SSE2-LABEL: var_funnnel_v8i16:
431 ; SSE2-NEXT: movdqa %xmm2, %xmm4
432 ; SSE2-NEXT: psllw $12, %xmm4
433 ; SSE2-NEXT: movdqa %xmm4, %xmm3
434 ; SSE2-NEXT: psraw $15, %xmm3
435 ; SSE2-NEXT: movdqa %xmm3, %xmm5
436 ; SSE2-NEXT: pandn %xmm1, %xmm5
437 ; SSE2-NEXT: psrlw $8, %xmm1
438 ; SSE2-NEXT: pand %xmm1, %xmm3
439 ; SSE2-NEXT: por %xmm5, %xmm3
440 ; SSE2-NEXT: paddw %xmm4, %xmm4
441 ; SSE2-NEXT: movdqa %xmm4, %xmm1
442 ; SSE2-NEXT: psraw $15, %xmm1
443 ; SSE2-NEXT: movdqa %xmm1, %xmm5
444 ; SSE2-NEXT: pandn %xmm3, %xmm5
445 ; SSE2-NEXT: psrlw $4, %xmm3
446 ; SSE2-NEXT: pand %xmm1, %xmm3
447 ; SSE2-NEXT: por %xmm5, %xmm3
448 ; SSE2-NEXT: paddw %xmm4, %xmm4
449 ; SSE2-NEXT: movdqa %xmm4, %xmm1
450 ; SSE2-NEXT: psraw $15, %xmm1
451 ; SSE2-NEXT: movdqa %xmm1, %xmm5
452 ; SSE2-NEXT: pandn %xmm3, %xmm5
453 ; SSE2-NEXT: psrlw $2, %xmm3
454 ; SSE2-NEXT: pand %xmm1, %xmm3
455 ; SSE2-NEXT: por %xmm5, %xmm3
456 ; SSE2-NEXT: paddw %xmm4, %xmm4
457 ; SSE2-NEXT: psraw $15, %xmm4
458 ; SSE2-NEXT: movdqa %xmm4, %xmm1
459 ; SSE2-NEXT: pandn %xmm3, %xmm1
460 ; SSE2-NEXT: psrlw $1, %xmm3
461 ; SSE2-NEXT: pand %xmm4, %xmm3
462 ; SSE2-NEXT: por %xmm1, %xmm3
463 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
464 ; SSE2-NEXT: movdqa %xmm2, %xmm1
465 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
466 ; SSE2-NEXT: pslld $23, %xmm1
467 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
468 ; SSE2-NEXT: paddd %xmm4, %xmm1
469 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
470 ; SSE2-NEXT: pslld $16, %xmm1
471 ; SSE2-NEXT: psrad $16, %xmm1
472 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
473 ; SSE2-NEXT: pslld $23, %xmm2
474 ; SSE2-NEXT: paddd %xmm4, %xmm2
475 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
476 ; SSE2-NEXT: pslld $16, %xmm2
477 ; SSE2-NEXT: psrad $16, %xmm2
478 ; SSE2-NEXT: packssdw %xmm1, %xmm2
479 ; SSE2-NEXT: paddw %xmm0, %xmm0
480 ; SSE2-NEXT: pmullw %xmm2, %xmm0
481 ; SSE2-NEXT: por %xmm3, %xmm0
484 ; SSE41-LABEL: var_funnnel_v8i16:
486 ; SSE41-NEXT: movdqa %xmm0, %xmm3
487 ; SSE41-NEXT: pmovsxbw {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
488 ; SSE41-NEXT: movdqa %xmm2, %xmm0
489 ; SSE41-NEXT: pand %xmm5, %xmm0
490 ; SSE41-NEXT: movdqa %xmm0, %xmm4
491 ; SSE41-NEXT: psllw $12, %xmm4
492 ; SSE41-NEXT: psllw $4, %xmm0
493 ; SSE41-NEXT: por %xmm4, %xmm0
494 ; SSE41-NEXT: movdqa %xmm0, %xmm4
495 ; SSE41-NEXT: paddw %xmm0, %xmm4
496 ; SSE41-NEXT: movdqa %xmm1, %xmm6
497 ; SSE41-NEXT: psrlw $8, %xmm6
498 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
499 ; SSE41-NEXT: movdqa %xmm1, %xmm6
500 ; SSE41-NEXT: psrlw $4, %xmm6
501 ; SSE41-NEXT: movdqa %xmm4, %xmm0
502 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
503 ; SSE41-NEXT: movdqa %xmm1, %xmm6
504 ; SSE41-NEXT: psrlw $2, %xmm6
505 ; SSE41-NEXT: paddw %xmm4, %xmm4
506 ; SSE41-NEXT: movdqa %xmm4, %xmm0
507 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
508 ; SSE41-NEXT: movdqa %xmm1, %xmm6
509 ; SSE41-NEXT: psrlw $1, %xmm6
510 ; SSE41-NEXT: paddw %xmm4, %xmm4
511 ; SSE41-NEXT: movdqa %xmm4, %xmm0
512 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
513 ; SSE41-NEXT: pandn %xmm5, %xmm2
514 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
515 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
516 ; SSE41-NEXT: pslld $23, %xmm2
517 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
518 ; SSE41-NEXT: paddd %xmm4, %xmm2
519 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
520 ; SSE41-NEXT: pslld $23, %xmm0
521 ; SSE41-NEXT: paddd %xmm4, %xmm0
522 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
523 ; SSE41-NEXT: packusdw %xmm2, %xmm0
524 ; SSE41-NEXT: paddw %xmm3, %xmm3
525 ; SSE41-NEXT: pmullw %xmm0, %xmm3
526 ; SSE41-NEXT: por %xmm1, %xmm3
527 ; SSE41-NEXT: movdqa %xmm3, %xmm0
530 ; AVX1-LABEL: var_funnnel_v8i16:
532 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
533 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
534 ; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5
535 ; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4
536 ; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
537 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5
538 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm6
539 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
540 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
541 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
542 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
543 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
544 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
545 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
546 ; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5
547 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
548 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
549 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
550 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
551 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
552 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
553 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
554 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
555 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
556 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
557 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
558 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
559 ; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
560 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
561 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
564 ; AVX2-LABEL: var_funnnel_v8i16:
566 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
567 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
568 ; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
569 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
570 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
571 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
572 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
573 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
574 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
575 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
576 ; AVX2-NEXT: vzeroupper
579 ; AVX512F-LABEL: var_funnnel_v8i16:
581 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
582 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
583 ; AVX512F-NEXT: vpslld $16, %ymm0, %ymm0
584 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
585 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
586 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
587 ; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
588 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
589 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
590 ; AVX512F-NEXT: vzeroupper
593 ; AVX512VL-LABEL: var_funnnel_v8i16:
595 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
596 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
597 ; AVX512VL-NEXT: vpslld $16, %ymm0, %ymm0
598 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
599 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
600 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
601 ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
602 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
603 ; AVX512VL-NEXT: vzeroupper
604 ; AVX512VL-NEXT: retq
606 ; AVX512BW-LABEL: var_funnnel_v8i16:
608 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
609 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
610 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
611 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
612 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
613 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
614 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
615 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
616 ; AVX512BW-NEXT: vzeroupper
617 ; AVX512BW-NEXT: retq
619 ; AVX512VBMI2-LABEL: var_funnnel_v8i16:
620 ; AVX512VBMI2: # %bb.0:
621 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
622 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
623 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
624 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
625 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
626 ; AVX512VBMI2-NEXT: vzeroupper
627 ; AVX512VBMI2-NEXT: retq
629 ; AVX512VLBW-LABEL: var_funnnel_v8i16:
630 ; AVX512VLBW: # %bb.0:
631 ; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
632 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
633 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
634 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
635 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
636 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
637 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
638 ; AVX512VLBW-NEXT: retq
640 ; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
641 ; AVX512VLVBMI2: # %bb.0:
642 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
643 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
644 ; AVX512VLVBMI2-NEXT: retq
646 ; XOPAVX1-LABEL: var_funnnel_v8i16:
648 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
649 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
650 ; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
651 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
652 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
653 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
654 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
655 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
656 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
659 ; XOPAVX2-LABEL: var_funnnel_v8i16:
661 ; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
662 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
663 ; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0
664 ; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
665 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
666 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
667 ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
668 ; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
669 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
672 ; X86-SSE2-LABEL: var_funnnel_v8i16:
674 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
675 ; X86-SSE2-NEXT: psllw $12, %xmm4
676 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
677 ; X86-SSE2-NEXT: psraw $15, %xmm3
678 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
679 ; X86-SSE2-NEXT: pandn %xmm1, %xmm5
680 ; X86-SSE2-NEXT: psrlw $8, %xmm1
681 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
682 ; X86-SSE2-NEXT: por %xmm5, %xmm3
683 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
684 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
685 ; X86-SSE2-NEXT: psraw $15, %xmm1
686 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
687 ; X86-SSE2-NEXT: pandn %xmm3, %xmm5
688 ; X86-SSE2-NEXT: psrlw $4, %xmm3
689 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
690 ; X86-SSE2-NEXT: por %xmm5, %xmm3
691 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
692 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
693 ; X86-SSE2-NEXT: psraw $15, %xmm1
694 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
695 ; X86-SSE2-NEXT: pandn %xmm3, %xmm5
696 ; X86-SSE2-NEXT: psrlw $2, %xmm3
697 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
698 ; X86-SSE2-NEXT: por %xmm5, %xmm3
699 ; X86-SSE2-NEXT: paddw %xmm4, %xmm4
700 ; X86-SSE2-NEXT: psraw $15, %xmm4
701 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
702 ; X86-SSE2-NEXT: pandn %xmm3, %xmm1
703 ; X86-SSE2-NEXT: psrlw $1, %xmm3
704 ; X86-SSE2-NEXT: pand %xmm4, %xmm3
705 ; X86-SSE2-NEXT: por %xmm1, %xmm3
706 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
707 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
708 ; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
709 ; X86-SSE2-NEXT: pslld $23, %xmm1
710 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
711 ; X86-SSE2-NEXT: paddd %xmm4, %xmm1
712 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
713 ; X86-SSE2-NEXT: pslld $16, %xmm1
714 ; X86-SSE2-NEXT: psrad $16, %xmm1
715 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
716 ; X86-SSE2-NEXT: pslld $23, %xmm2
717 ; X86-SSE2-NEXT: paddd %xmm4, %xmm2
718 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
719 ; X86-SSE2-NEXT: pslld $16, %xmm2
720 ; X86-SSE2-NEXT: psrad $16, %xmm2
721 ; X86-SSE2-NEXT: packssdw %xmm1, %xmm2
722 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
723 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
724 ; X86-SSE2-NEXT: por %xmm3, %xmm0
725 ; X86-SSE2-NEXT: retl
726 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
730 define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
731 ; SSE2-LABEL: var_funnnel_v16i8:
733 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
734 ; SSE2-NEXT: movdqa %xmm2, %xmm6
735 ; SSE2-NEXT: pand %xmm5, %xmm6
736 ; SSE2-NEXT: psllw $5, %xmm6
737 ; SSE2-NEXT: pxor %xmm4, %xmm4
738 ; SSE2-NEXT: pxor %xmm3, %xmm3
739 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm3
740 ; SSE2-NEXT: movdqa %xmm3, %xmm7
741 ; SSE2-NEXT: pandn %xmm1, %xmm7
742 ; SSE2-NEXT: psrlw $4, %xmm1
743 ; SSE2-NEXT: pand %xmm1, %xmm3
744 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
745 ; SSE2-NEXT: por %xmm7, %xmm3
746 ; SSE2-NEXT: paddb %xmm6, %xmm6
747 ; SSE2-NEXT: pxor %xmm1, %xmm1
748 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
749 ; SSE2-NEXT: movdqa %xmm1, %xmm7
750 ; SSE2-NEXT: pandn %xmm3, %xmm7
751 ; SSE2-NEXT: psrlw $2, %xmm3
752 ; SSE2-NEXT: pand %xmm1, %xmm3
753 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
754 ; SSE2-NEXT: por %xmm7, %xmm3
755 ; SSE2-NEXT: paddb %xmm6, %xmm6
756 ; SSE2-NEXT: pxor %xmm1, %xmm1
757 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
758 ; SSE2-NEXT: movdqa %xmm1, %xmm6
759 ; SSE2-NEXT: pandn %xmm3, %xmm6
760 ; SSE2-NEXT: psrlw $1, %xmm3
761 ; SSE2-NEXT: pand %xmm1, %xmm3
762 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
763 ; SSE2-NEXT: por %xmm6, %xmm3
764 ; SSE2-NEXT: pandn %xmm5, %xmm2
765 ; SSE2-NEXT: psllw $5, %xmm2
766 ; SSE2-NEXT: pxor %xmm1, %xmm1
767 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
768 ; SSE2-NEXT: paddb %xmm0, %xmm0
769 ; SSE2-NEXT: movdqa %xmm1, %xmm5
770 ; SSE2-NEXT: pandn %xmm0, %xmm5
771 ; SSE2-NEXT: psllw $4, %xmm0
772 ; SSE2-NEXT: pand %xmm1, %xmm0
773 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
774 ; SSE2-NEXT: por %xmm5, %xmm0
775 ; SSE2-NEXT: paddb %xmm2, %xmm2
776 ; SSE2-NEXT: pxor %xmm1, %xmm1
777 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
778 ; SSE2-NEXT: movdqa %xmm1, %xmm5
779 ; SSE2-NEXT: pandn %xmm0, %xmm5
780 ; SSE2-NEXT: psllw $2, %xmm0
781 ; SSE2-NEXT: pand %xmm1, %xmm0
782 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
783 ; SSE2-NEXT: por %xmm5, %xmm0
784 ; SSE2-NEXT: paddb %xmm2, %xmm2
785 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
786 ; SSE2-NEXT: movdqa %xmm4, %xmm1
787 ; SSE2-NEXT: pandn %xmm0, %xmm1
788 ; SSE2-NEXT: paddb %xmm0, %xmm0
789 ; SSE2-NEXT: pand %xmm4, %xmm0
790 ; SSE2-NEXT: por %xmm1, %xmm0
791 ; SSE2-NEXT: por %xmm3, %xmm0
794 ; SSE41-LABEL: var_funnnel_v16i8:
796 ; SSE41-NEXT: movdqa %xmm2, %xmm3
797 ; SSE41-NEXT: movdqa %xmm0, %xmm2
798 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
799 ; SSE41-NEXT: movdqa %xmm3, %xmm0
800 ; SSE41-NEXT: pand %xmm5, %xmm0
801 ; SSE41-NEXT: psllw $5, %xmm0
802 ; SSE41-NEXT: movdqa %xmm0, %xmm4
803 ; SSE41-NEXT: paddb %xmm0, %xmm4
804 ; SSE41-NEXT: movdqa %xmm1, %xmm6
805 ; SSE41-NEXT: psrlw $4, %xmm6
806 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
807 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
808 ; SSE41-NEXT: movdqa %xmm1, %xmm6
809 ; SSE41-NEXT: psrlw $2, %xmm6
810 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
811 ; SSE41-NEXT: movdqa %xmm4, %xmm0
812 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
813 ; SSE41-NEXT: movdqa %xmm1, %xmm6
814 ; SSE41-NEXT: psrlw $1, %xmm6
815 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
816 ; SSE41-NEXT: paddb %xmm4, %xmm4
817 ; SSE41-NEXT: movdqa %xmm4, %xmm0
818 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
819 ; SSE41-NEXT: pandn %xmm5, %xmm3
820 ; SSE41-NEXT: psllw $5, %xmm3
821 ; SSE41-NEXT: movdqa %xmm3, %xmm4
822 ; SSE41-NEXT: paddb %xmm3, %xmm4
823 ; SSE41-NEXT: paddb %xmm2, %xmm2
824 ; SSE41-NEXT: movdqa %xmm2, %xmm5
825 ; SSE41-NEXT: psllw $4, %xmm5
826 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
827 ; SSE41-NEXT: movdqa %xmm3, %xmm0
828 ; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
829 ; SSE41-NEXT: movdqa %xmm2, %xmm3
830 ; SSE41-NEXT: psllw $2, %xmm3
831 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
832 ; SSE41-NEXT: movdqa %xmm4, %xmm0
833 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
834 ; SSE41-NEXT: movdqa %xmm2, %xmm3
835 ; SSE41-NEXT: paddb %xmm2, %xmm3
836 ; SSE41-NEXT: paddb %xmm4, %xmm4
837 ; SSE41-NEXT: movdqa %xmm4, %xmm0
838 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
839 ; SSE41-NEXT: por %xmm1, %xmm2
840 ; SSE41-NEXT: movdqa %xmm2, %xmm0
843 ; AVX1-LABEL: var_funnnel_v16i8:
845 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
846 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
847 ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
848 ; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
849 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
850 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
851 ; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
852 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
853 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
854 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
855 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
856 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
857 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
858 ; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
859 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
860 ; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
861 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
862 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
863 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4
864 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
865 ; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
866 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2
867 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
868 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
869 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
870 ; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
871 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
872 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
875 ; AVX2-LABEL: var_funnnel_v16i8:
877 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
878 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
879 ; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4
880 ; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
881 ; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
882 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
883 ; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
884 ; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4
885 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
886 ; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
887 ; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm4
888 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
889 ; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
890 ; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
891 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
892 ; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2
893 ; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
894 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
895 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4
896 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
897 ; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
898 ; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
899 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
900 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
901 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
902 ; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
903 ; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
904 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
907 ; AVX512F-LABEL: var_funnnel_v16i8:
909 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
910 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
911 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
912 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
913 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
914 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
915 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
916 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
917 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
918 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
919 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
920 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
921 ; AVX512F-NEXT: vzeroupper
924 ; AVX512VL-LABEL: var_funnnel_v16i8:
926 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
927 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
928 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
929 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
930 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
931 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
932 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
933 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
934 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
935 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
936 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
937 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
938 ; AVX512VL-NEXT: vzeroupper
939 ; AVX512VL-NEXT: retq
941 ; AVX512BW-LABEL: var_funnnel_v16i8:
943 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
944 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
945 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
946 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
947 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
948 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
949 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
950 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
951 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
952 ; AVX512BW-NEXT: vzeroupper
953 ; AVX512BW-NEXT: retq
955 ; AVX512VBMI2-LABEL: var_funnnel_v16i8:
956 ; AVX512VBMI2: # %bb.0:
957 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
958 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
959 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
960 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm3, %zmm1
961 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
962 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
963 ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0
964 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
965 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
966 ; AVX512VBMI2-NEXT: vzeroupper
967 ; AVX512VBMI2-NEXT: retq
969 ; AVX512VLBW-LABEL: var_funnnel_v16i8:
970 ; AVX512VLBW: # %bb.0:
971 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
972 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
973 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
974 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
975 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
976 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
977 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
978 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
979 ; AVX512VLBW-NEXT: vzeroupper
980 ; AVX512VLBW-NEXT: retq
982 ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
983 ; AVX512VLVBMI2: # %bb.0:
984 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
985 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
986 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
987 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm3
988 ; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
989 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
990 ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm0, %ymm3, %ymm0
991 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
992 ; AVX512VLVBMI2-NEXT: vzeroupper
993 ; AVX512VLVBMI2-NEXT: retq
995 ; XOPAVX1-LABEL: var_funnnel_v16i8:
997 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
998 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
999 ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1000 ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm0, %xmm0
1001 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1002 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1003 ; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
1004 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1005 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1006 ; XOPAVX1-NEXT: retq
1008 ; XOPAVX2-LABEL: var_funnnel_v16i8:
1010 ; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1011 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
1012 ; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
1013 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
1014 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1015 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1016 ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
1017 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
1018 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1019 ; XOPAVX2-NEXT: retq
1021 ; X86-SSE2-LABEL: var_funnnel_v16i8:
1022 ; X86-SSE2: # %bb.0:
1023 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1024 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
1025 ; X86-SSE2-NEXT: pand %xmm5, %xmm6
1026 ; X86-SSE2-NEXT: psllw $5, %xmm6
1027 ; X86-SSE2-NEXT: pxor %xmm4, %xmm4
1028 ; X86-SSE2-NEXT: pxor %xmm3, %xmm3
1029 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3
1030 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
1031 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7
1032 ; X86-SSE2-NEXT: psrlw $4, %xmm1
1033 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1034 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1035 ; X86-SSE2-NEXT: por %xmm7, %xmm3
1036 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6
1037 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1038 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
1039 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
1040 ; X86-SSE2-NEXT: pandn %xmm3, %xmm7
1041 ; X86-SSE2-NEXT: psrlw $2, %xmm3
1042 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1043 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1044 ; X86-SSE2-NEXT: por %xmm7, %xmm3
1045 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6
1046 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1047 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
1048 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
1049 ; X86-SSE2-NEXT: pandn %xmm3, %xmm6
1050 ; X86-SSE2-NEXT: psrlw $1, %xmm3
1051 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1052 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
1053 ; X86-SSE2-NEXT: por %xmm6, %xmm3
1054 ; X86-SSE2-NEXT: pandn %xmm5, %xmm2
1055 ; X86-SSE2-NEXT: psllw $5, %xmm2
1056 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1057 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
1058 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1059 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1060 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5
1061 ; X86-SSE2-NEXT: psllw $4, %xmm0
1062 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
1063 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1064 ; X86-SSE2-NEXT: por %xmm5, %xmm0
1065 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1066 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
1067 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
1068 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
1069 ; X86-SSE2-NEXT: pandn %xmm0, %xmm5
1070 ; X86-SSE2-NEXT: psllw $2, %xmm0
1071 ; X86-SSE2-NEXT: pand %xmm1, %xmm0
1072 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1073 ; X86-SSE2-NEXT: por %xmm5, %xmm0
1074 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2
1075 ; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
1076 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
1077 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1
1078 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
1079 ; X86-SSE2-NEXT: pand %xmm4, %xmm0
1080 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1081 ; X86-SSE2-NEXT: por %xmm3, %xmm0
1082 ; X86-SSE2-NEXT: retl
1083 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
1088 ; Uniform Variable Shifts
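; The splatvar_* tests splat element 0 of %amt across the vector before the
; fshr call, so the shift amount can stay scalar in the low element: v2i64
; uses a single PSRLQ/PSLLQ pair, and v4i32 interleaves x and y into 64-bit
; lanes so one PSRLQ per interleaved half computes the concatenate-and-shift
; directly.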
1091 define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
1092 ; SSE2-LABEL: splatvar_funnnel_v2i64:
1094 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,63]
1095 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1096 ; SSE2-NEXT: pand %xmm3, %xmm4
1097 ; SSE2-NEXT: psrlq %xmm4, %xmm1
1098 ; SSE2-NEXT: pandn %xmm3, %xmm2
1099 ; SSE2-NEXT: paddq %xmm0, %xmm0
1100 ; SSE2-NEXT: psllq %xmm2, %xmm0
1101 ; SSE2-NEXT: por %xmm1, %xmm0
1104 ; SSE41-LABEL: splatvar_funnnel_v2i64:
1106 ; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [63,63]
1107 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1108 ; SSE41-NEXT: pand %xmm3, %xmm4
1109 ; SSE41-NEXT: psrlq %xmm4, %xmm1
1110 ; SSE41-NEXT: pandn %xmm3, %xmm2
1111 ; SSE41-NEXT: paddq %xmm0, %xmm0
1112 ; SSE41-NEXT: psllq %xmm2, %xmm0
1113 ; SSE41-NEXT: por %xmm1, %xmm0
1116 ; AVX-LABEL: splatvar_funnnel_v2i64:
1118 ; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
1119 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
1120 ; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1121 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
1122 ; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1123 ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1124 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1127 ; AVX512F-LABEL: splatvar_funnnel_v2i64:
1129 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
1130 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1131 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1132 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1133 ; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1134 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1135 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1136 ; AVX512F-NEXT: retq
1138 ; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1139 ; AVX512VL: # %bb.0:
1140 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1141 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1142 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1143 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1144 ; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1145 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1146 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1147 ; AVX512VL-NEXT: retq
1149 ; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1150 ; AVX512BW: # %bb.0:
1151 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
1152 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1153 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1154 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1155 ; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1156 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1157 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1158 ; AVX512BW-NEXT: retq
1160 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1161 ; AVX512VBMI2: # %bb.0:
1162 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1163 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1164 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1165 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1166 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1167 ; AVX512VBMI2-NEXT: vzeroupper
1168 ; AVX512VBMI2-NEXT: retq
1170 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1171 ; AVX512VLBW: # %bb.0:
1172 ; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
1173 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1174 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1175 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1176 ; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1177 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1178 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1179 ; AVX512VLBW-NEXT: retq
1181 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1182 ; AVX512VLVBMI2: # %bb.0:
1183 ; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1184 ; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
1185 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1186 ; AVX512VLVBMI2-NEXT: retq
1188 ; XOP-LABEL: splatvar_funnnel_v2i64:
1190 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
1191 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
1192 ; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
1193 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
1194 ; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0
1195 ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
1196 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1199 ; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
1200 ; X86-SSE2: # %bb.0:
1201 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
1202 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1203 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1204 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
1205 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1206 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0
1207 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0
1208 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1209 ; X86-SSE2-NEXT: retl
1210 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1211 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
1215 define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1216 ; SSE-LABEL: splatvar_funnnel_v4i32:
1218 ; SSE-NEXT: movdqa %xmm1, %xmm3
1219 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1220 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1221 ; SSE-NEXT: psrlq %xmm2, %xmm3
1222 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1223 ; SSE-NEXT: psrlq %xmm2, %xmm1
1224 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1225 ; SSE-NEXT: movaps %xmm1, %xmm0
1228 ; AVX-LABEL: splatvar_funnnel_v4i32:
1230 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1231 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1232 ; AVX-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1233 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1234 ; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1235 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1238 ; AVX512F-LABEL: splatvar_funnnel_v4i32:
1240 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1241 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1242 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1243 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1244 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1245 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1246 ; AVX512F-NEXT: retq
1248 ; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1249 ; AVX512VL: # %bb.0:
1250 ; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1251 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1252 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1253 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1254 ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
1255 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
1256 ; AVX512VL-NEXT: vzeroupper
1257 ; AVX512VL-NEXT: retq
1259 ; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1260 ; AVX512BW: # %bb.0:
1261 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1262 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1263 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1264 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1265 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1266 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1267 ; AVX512BW-NEXT: retq
1269 ; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1270 ; AVX512VBMI2: # %bb.0:
1271 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1272 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1273 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1274 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1275 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1276 ; AVX512VBMI2-NEXT: vzeroupper
1277 ; AVX512VBMI2-NEXT: retq
1279 ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1280 ; AVX512VLBW: # %bb.0:
1281 ; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1282 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1283 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1284 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1285 ; AVX512VLBW-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
1286 ; AVX512VLBW-NEXT: vpmovqd %ymm0, %xmm0
1287 ; AVX512VLBW-NEXT: vzeroupper
1288 ; AVX512VLBW-NEXT: retq
1290 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1291 ; AVX512VLVBMI2: # %bb.0:
1292 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1293 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
1294 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1295 ; AVX512VLVBMI2-NEXT: retq
1297 ; XOP-LABEL: splatvar_funnnel_v4i32:
1299 ; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1300 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1301 ; XOP-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
1302 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1303 ; XOP-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
1304 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1307 ; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1308 ; X86-SSE2: # %bb.0:
1309 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1310 ; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1311 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1312 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm3
1313 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1314 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm1
1315 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1316 ; X86-SSE2-NEXT: movaps %xmm1, %xmm0
1317 ; X86-SSE2-NEXT: retl
1318 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1319 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
1320 ret <4 x i32> %res
1321 }
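;
; For the 32-bit case the SSE/AVX paths avoid a separate left shift entirely:
; punpckldq/punpckhdq interleave y (low dword) with x (high dword) so each
; 64-bit lane holds x:y, a psrlq of each interleaved half by the masked amount
; shifts x's bits down into y's dword, and shufps collects the low dwords
; (the VL targets do the same on a single ymm and truncate with vpmovqd).
; Roughly, per element: (concat(x, y) >> amt) truncated back to i32.
;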
1323 define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1324 ; SSE2-LABEL: splatvar_funnnel_v8i16:
1326 ; SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
1327 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1328 ; SSE2-NEXT: pand %xmm3, %xmm4
1329 ; SSE2-NEXT: psrlw %xmm4, %xmm1
1330 ; SSE2-NEXT: pandn %xmm3, %xmm2
1331 ; SSE2-NEXT: paddw %xmm0, %xmm0
1332 ; SSE2-NEXT: psllw %xmm2, %xmm0
1333 ; SSE2-NEXT: por %xmm1, %xmm0
1336 ; SSE41-LABEL: splatvar_funnnel_v8i16:
1338 ; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [15,0]
1339 ; SSE41-NEXT: movdqa %xmm2, %xmm4
1340 ; SSE41-NEXT: pand %xmm3, %xmm4
1341 ; SSE41-NEXT: psrlw %xmm4, %xmm1
1342 ; SSE41-NEXT: pandn %xmm3, %xmm2
1343 ; SSE41-NEXT: paddw %xmm0, %xmm0
1344 ; SSE41-NEXT: psllw %xmm2, %xmm0
1345 ; SSE41-NEXT: por %xmm1, %xmm0
1348 ; AVX-LABEL: splatvar_funnnel_v8i16:
1350 ; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1351 ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
1352 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1353 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
1354 ; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1355 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1356 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1359 ; AVX512F-LABEL: splatvar_funnnel_v8i16:
1361 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1362 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1363 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1364 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
1365 ; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1366 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1367 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1368 ; AVX512F-NEXT: retq
1370 ; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1371 ; AVX512VL: # %bb.0:
1372 ; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1373 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1374 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1375 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
1376 ; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1377 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1378 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1379 ; AVX512VL-NEXT: retq
1381 ; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1382 ; AVX512BW: # %bb.0:
1383 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1384 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1385 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1386 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1387 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1388 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1389 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1390 ; AVX512BW-NEXT: retq
1392 ; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1393 ; AVX512VBMI2: # %bb.0:
1394 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1395 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1396 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1397 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
1398 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1399 ; AVX512VBMI2-NEXT: vzeroupper
1400 ; AVX512VBMI2-NEXT: retq
1402 ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1403 ; AVX512VLBW: # %bb.0:
1404 ; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1405 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1406 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1407 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
1408 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1409 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1410 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1411 ; AVX512VLBW-NEXT: retq
1413 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1414 ; AVX512VLVBMI2: # %bb.0:
1415 ; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1416 ; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
1417 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1418 ; AVX512VLVBMI2-NEXT: retq
1420 ; XOP-LABEL: splatvar_funnnel_v8i16:
1422 ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
1423 ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
1424 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
1425 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
1426 ; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1427 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0
1428 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1431 ; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1432 ; X86-SSE2: # %bb.0:
1433 ; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0]
1434 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
1435 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1436 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
1437 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2
1438 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
1439 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0
1440 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1441 ; X86-SSE2-NEXT: retl
1442 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1443 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
1444 ret <8 x i16> %res
1445 }
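;
; The 16-bit version follows the same shape as the v2i64 case: the amount is
; split into (amt & 15) for the psrlw on y and (~amt & 15) for the psllw on the
; pre-doubled x, so x contributes x << (16 - amt) without ever shifting by 16.
;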
1447 define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1448 ; SSE2-LABEL: splatvar_funnnel_v16i8:
1450 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1451 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1452 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1453 ; SSE2-NEXT: psrlw %xmm2, %xmm4
1454 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1455 ; SSE2-NEXT: pand %xmm3, %xmm4
1456 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1457 ; SSE2-NEXT: psrlw %xmm2, %xmm1
1458 ; SSE2-NEXT: pand %xmm1, %xmm3
1459 ; SSE2-NEXT: packuswb %xmm4, %xmm3
1460 ; SSE2-NEXT: movdqa %xmm3, %xmm0
1463 ; SSE41-LABEL: splatvar_funnnel_v16i8:
1465 ; SSE41-NEXT: movdqa %xmm1, %xmm4
1466 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1467 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1468 ; SSE41-NEXT: psrlw %xmm2, %xmm4
1469 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1470 ; SSE41-NEXT: pand %xmm3, %xmm4
1471 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1472 ; SSE41-NEXT: psrlw %xmm2, %xmm1
1473 ; SSE41-NEXT: pand %xmm1, %xmm3
1474 ; SSE41-NEXT: packuswb %xmm4, %xmm3
1475 ; SSE41-NEXT: movdqa %xmm3, %xmm0
1478 ; AVX1-LABEL: splatvar_funnnel_v16i8:
1480 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1481 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1482 ; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1483 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1484 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
1485 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1486 ; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1487 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1488 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1491 ; AVX2-LABEL: splatvar_funnnel_v16i8:
1493 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1494 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1495 ; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1496 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1497 ; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
1498 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1499 ; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1500 ; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
1501 ; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1504 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
1506 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1507 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1508 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1509 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1510 ; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3
1511 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1512 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1513 ; AVX512F-NEXT: vpand %xmm4, %xmm0, %xmm0
1514 ; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1515 ; AVX512F-NEXT: retq
1517 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1518 ; AVX512VL: # %bb.0:
1519 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1520 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1521 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1522 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1523 ; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3
1524 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1525 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1526 ; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0
1527 ; AVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1528 ; AVX512VL-NEXT: retq
1530 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1531 ; AVX512BW: # %bb.0:
1532 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1533 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1534 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1535 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1536 ; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3
1537 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1538 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1539 ; AVX512BW-NEXT: vpand %xmm4, %xmm0, %xmm0
1540 ; AVX512BW-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1541 ; AVX512BW-NEXT: retq
1543 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1544 ; AVX512VBMI2: # %bb.0:
1545 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78]
1546 ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1547 ; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1548 ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm4, %xmm4
1549 ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1550 ; AVX512VBMI2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1551 ; AVX512VBMI2-NEXT: vpermt2b %zmm4, %zmm3, %zmm0
1552 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1553 ; AVX512VBMI2-NEXT: vzeroupper
1554 ; AVX512VBMI2-NEXT: retq
1556 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1557 ; AVX512VLBW: # %bb.0:
1558 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1559 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1560 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1561 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1562 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1563 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1564 ; AVX512VLBW-NEXT: vzeroupper
1565 ; AVX512VLBW-NEXT: retq
1567 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1568 ; AVX512VLVBMI2: # %bb.0:
1569 ; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1570 ; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1571 ; AVX512VLVBMI2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
1572 ; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1573 ; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
1574 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1575 ; AVX512VLVBMI2-NEXT: vzeroupper
1576 ; AVX512VLVBMI2-NEXT: retq
1578 ; XOP-LABEL: splatvar_funnnel_v16i8:
1580 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1581 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1582 ; XOP-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
1583 ; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1584 ; XOP-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
1585 ; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14]
1588 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1589 ; X86-SSE2: # %bb.0:
1590 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
1591 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1592 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1593 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm4
1594 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1595 ; X86-SSE2-NEXT: pand %xmm3, %xmm4
1596 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1597 ; X86-SSE2-NEXT: psrlw %xmm2, %xmm1
1598 ; X86-SSE2-NEXT: pand %xmm1, %xmm3
1599 ; X86-SSE2-NEXT: packuswb %xmm4, %xmm3
1600 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
1601 ; X86-SSE2-NEXT: retl
1602 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
1603 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
1604 ret <16 x i8> %res
1605 }
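;
; There is no native variable byte shift, so the byte test is widened: the
; unpacks pair each byte of y with the matching byte of x into a 16-bit lane
; (y in the low byte, x in the high byte), a single psrlw by the masked amount
; funnels x's bits down into y's byte, and the results are masked to 255 and
; re-packed with packuswb (or truncated with vpmovwb / vpperm on the wider
; targets).
;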
1607 ;
1608 ; Constant Shifts
1609 ;
1611 define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1612 ; SSE2-LABEL: constant_funnnel_v2i64:
1614 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1615 ; SSE2-NEXT: psrlq $4, %xmm2
1616 ; SSE2-NEXT: psrlq $14, %xmm1
1617 ; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1618 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1619 ; SSE2-NEXT: psllq $60, %xmm1
1620 ; SSE2-NEXT: psllq $50, %xmm0
1621 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1622 ; SSE2-NEXT: orpd %xmm2, %xmm0
1625 ; SSE41-LABEL: constant_funnnel_v2i64:
1627 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1628 ; SSE41-NEXT: psrlq $14, %xmm2
1629 ; SSE41-NEXT: psrlq $4, %xmm1
1630 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1631 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1632 ; SSE41-NEXT: psllq $50, %xmm1
1633 ; SSE41-NEXT: psllq $60, %xmm0
1634 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1635 ; SSE41-NEXT: por %xmm2, %xmm0
1638 ; AVX1-LABEL: constant_funnnel_v2i64:
1640 ; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
1641 ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
1642 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1643 ; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
1644 ; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
1645 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1646 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1649 ; AVX2-LABEL: constant_funnnel_v2i64:
1651 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1652 ; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1653 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1656 ; AVX512F-LABEL: constant_funnnel_v2i64:
1658 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1659 ; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1660 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1661 ; AVX512F-NEXT: retq
1663 ; AVX512VL-LABEL: constant_funnnel_v2i64:
1664 ; AVX512VL: # %bb.0:
1665 ; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1666 ; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1667 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1668 ; AVX512VL-NEXT: retq
1670 ; AVX512BW-LABEL: constant_funnnel_v2i64:
1671 ; AVX512BW: # %bb.0:
1672 ; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1673 ; AVX512BW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1674 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1675 ; AVX512BW-NEXT: retq
1677 ; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1678 ; AVX512VBMI2: # %bb.0:
1679 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1680 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1681 ; AVX512VBMI2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,14]
1682 ; AVX512VBMI2-NEXT: vpshrdvq %zmm2, %zmm0, %zmm1
1683 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1684 ; AVX512VBMI2-NEXT: vzeroupper
1685 ; AVX512VBMI2-NEXT: retq
1687 ; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1688 ; AVX512VLBW: # %bb.0:
1689 ; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1690 ; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1691 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1692 ; AVX512VLBW-NEXT: retq
1694 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1695 ; AVX512VLVBMI2: # %bb.0:
1696 ; AVX512VLVBMI2-NEXT: vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1697 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1698 ; AVX512VLVBMI2-NEXT: retq
1700 ; XOPAVX1-LABEL: constant_funnnel_v2i64:
1702 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1703 ; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1704 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1705 ; XOPAVX1-NEXT: retq
1707 ; XOPAVX2-LABEL: constant_funnnel_v2i64:
1709 ; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1710 ; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1711 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1712 ; XOPAVX2-NEXT: retq
1714 ; X86-SSE2-LABEL: constant_funnnel_v2i64:
1715 ; X86-SSE2: # %bb.0:
1716 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1717 ; X86-SSE2-NEXT: psrlq $4, %xmm2
1718 ; X86-SSE2-NEXT: psrlq $14, %xmm1
1719 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1720 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
1721 ; X86-SSE2-NEXT: psllq $60, %xmm1
1722 ; X86-SSE2-NEXT: psllq $50, %xmm0
1723 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1724 ; X86-SSE2-NEXT: orpd %xmm2, %xmm0
1725 ; X86-SSE2-NEXT: retl
1726 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
1727 ret <2 x i64> %res
1728 }
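;
; With the constant amounts <4, 14> the expected result is simply
;   lane0: (y0 >> 4)  | (x0 << 60)   (64 - 4  = 60)
;   lane1: (y1 >> 14) | (x1 << 50)   (64 - 14 = 50)
; which is exactly the psrlq/psllq immediate pairs checked above; AVX2 and
; later take the shift pairs from constant pools for vpsrlvq/vpsllvq, and the
; VBMI2 targets match vpshrdvq directly.
;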
1730 define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
1731 ; SSE2-LABEL: constant_funnnel_v4i32:
1733 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1734 ; SSE2-NEXT: psrld $7, %xmm2
1735 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1736 ; SSE2-NEXT: psrld $6, %xmm3
1737 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1738 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1739 ; SSE2-NEXT: psrld $5, %xmm2
1740 ; SSE2-NEXT: psrld $4, %xmm1
1741 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1742 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1743 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1744 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1745 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1746 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1747 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1748 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1749 ; SSE2-NEXT: por %xmm1, %xmm0
1752 ; SSE41-LABEL: constant_funnnel_v4i32:
1754 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1755 ; SSE41-NEXT: psrld $7, %xmm2
1756 ; SSE41-NEXT: movdqa %xmm1, %xmm3
1757 ; SSE41-NEXT: psrld $5, %xmm3
1758 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1759 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1760 ; SSE41-NEXT: psrld $6, %xmm2
1761 ; SSE41-NEXT: psrld $4, %xmm1
1762 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1763 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1764 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1765 ; SSE41-NEXT: por %xmm2, %xmm0
1768 ; AVX1-LABEL: constant_funnnel_v4i32:
1770 ; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
1771 ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
1772 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1773 ; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
1774 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
1775 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1776 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1777 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1778 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1781 ; AVX2-LABEL: constant_funnnel_v4i32:
1783 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1784 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1785 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1788 ; AVX512F-LABEL: constant_funnnel_v4i32:
1790 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1791 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1792 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1793 ; AVX512F-NEXT: retq
1795 ; AVX512VL-LABEL: constant_funnnel_v4i32:
1796 ; AVX512VL: # %bb.0:
1797 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1798 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1799 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1800 ; AVX512VL-NEXT: retq
1802 ; AVX512BW-LABEL: constant_funnnel_v4i32:
1803 ; AVX512BW: # %bb.0:
1804 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1805 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1806 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1807 ; AVX512BW-NEXT: retq
1809 ; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1810 ; AVX512VBMI2: # %bb.0:
1811 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1812 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1813 ; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7]
1814 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
1815 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1816 ; AVX512VBMI2-NEXT: vzeroupper
1817 ; AVX512VBMI2-NEXT: retq
1819 ; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1820 ; AVX512VLBW: # %bb.0:
1821 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1822 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1823 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1824 ; AVX512VLBW-NEXT: retq
1826 ; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1827 ; AVX512VLVBMI2: # %bb.0:
1828 ; AVX512VLVBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1829 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1830 ; AVX512VLVBMI2-NEXT: retq
1832 ; XOPAVX1-LABEL: constant_funnnel_v4i32:
1834 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1835 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1836 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
1837 ; XOPAVX1-NEXT: retq
1839 ; XOPAVX2-LABEL: constant_funnnel_v4i32:
1841 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1842 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1843 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1844 ; XOPAVX2-NEXT: retq
1846 ; X86-SSE2-LABEL: constant_funnnel_v4i32:
1847 ; X86-SSE2: # %bb.0:
1848 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1849 ; X86-SSE2-NEXT: psrld $7, %xmm2
1850 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
1851 ; X86-SSE2-NEXT: psrld $6, %xmm3
1852 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1853 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
1854 ; X86-SSE2-NEXT: psrld $5, %xmm2
1855 ; X86-SSE2-NEXT: psrld $4, %xmm1
1856 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1857 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1858 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1859 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1860 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1861 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1862 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1863 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1864 ; X86-SSE2-NEXT: por %xmm1, %xmm0
1865 ; X86-SSE2-NEXT: retl
1866 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
1867 ret <4 x i32> %res
1868 }
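;
; For amounts <4, 5, 6, 7> the right shifts of y are emitted directly
; (psrld $4..$7) and blended together, while the left shifts of x by
; 32 - amt = <28, 27, 26, 25> are presumably folded into the pmuludq/pmulld
; constant-pool multiplies (i.e. multiplying by 2^28 .. 2^25); AVX2 and later
; use vpsrlvd/vpsllvd instead.
;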
1870 define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
1871 ; SSE2-LABEL: constant_funnnel_v8i16:
1873 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1874 ; SSE2-NEXT: pandn %xmm1, %xmm2
1875 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1876 ; SSE2-NEXT: por %xmm1, %xmm2
1877 ; SSE2-NEXT: paddw %xmm0, %xmm0
1878 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1879 ; SSE2-NEXT: por %xmm2, %xmm0
1882 ; SSE41-LABEL: constant_funnnel_v8i16:
1884 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,32768,16384,8192,4096,2048,1024,512]
1885 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2
1886 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1887 ; SSE41-NEXT: paddw %xmm0, %xmm0
1888 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1889 ; SSE41-NEXT: por %xmm2, %xmm0
1892 ; AVX-LABEL: constant_funnnel_v8i16:
1894 ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512]
1895 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1896 ; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1897 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1898 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
1901 ; AVX512F-LABEL: constant_funnnel_v8i16:
1903 ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512]
1904 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1905 ; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1906 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1907 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
1908 ; AVX512F-NEXT: retq
1910 ; AVX512VL-LABEL: constant_funnnel_v8i16:
1911 ; AVX512VL: # %bb.0:
1912 ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512]
1913 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1914 ; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1915 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1916 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
1917 ; AVX512VL-NEXT: retq
1919 ; AVX512BW-LABEL: constant_funnnel_v8i16:
1920 ; AVX512BW: # %bb.0:
1921 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1922 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1923 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
1924 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
1925 ; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1926 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
1927 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1928 ; AVX512BW-NEXT: vzeroupper
1929 ; AVX512BW-NEXT: retq
1931 ; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
1932 ; AVX512VBMI2: # %bb.0:
1933 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1934 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1935 ; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1936 ; AVX512VBMI2-NEXT: vpshrdvw %zmm2, %zmm0, %zmm1
1937 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
1938 ; AVX512VBMI2-NEXT: vzeroupper
1939 ; AVX512VBMI2-NEXT: retq
1941 ; AVX512VLBW-LABEL: constant_funnnel_v8i16:
1942 ; AVX512VLBW: # %bb.0:
1943 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1944 ; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1945 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1946 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
1947 ; AVX512VLBW-NEXT: retq
1949 ; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
1950 ; AVX512VLVBMI2: # %bb.0:
1951 ; AVX512VLVBMI2-NEXT: vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1952 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1953 ; AVX512VLVBMI2-NEXT: retq
1955 ; XOP-LABEL: constant_funnnel_v8i16:
1957 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1958 ; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
1959 ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1960 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
1963 ; X86-SSE2-LABEL: constant_funnnel_v8i16:
1964 ; X86-SSE2: # %bb.0:
1965 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1966 ; X86-SSE2-NEXT: pandn %xmm1, %xmm2
1967 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1968 ; X86-SSE2-NEXT: por %xmm1, %xmm2
1969 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0
1970 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1971 ; X86-SSE2-NEXT: por %xmm2, %xmm0
1972 ; X86-SSE2-NEXT: retl
1973 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
1974 ret <8 x i16> %res
1975 }
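;
; The 16-bit constant case leans on multiplies: pmulhuw by <u, 2^15 .. 2^9>
; yields y >> amt for the amounts 1..7 (lane 0, amount 0, is taken from y by
; the pblendw / pandn), and pmullw of the pre-doubled x by <2^15 .. 2^8>
; yields x << (16 - amt), with the amount-0 lane wrapping to zero so the final
; por leaves y untouched there.
;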
1977 define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
1978 ; SSE2-LABEL: constant_funnnel_v16i8:
1980 ; SSE2-NEXT: pxor %xmm2, %xmm2
1981 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1982 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1983 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,4,8,16,32,64,128]
1984 ; SSE2-NEXT: psrlw $8, %xmm3
1985 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1986 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
1987 ; SSE2-NEXT: psrlw $8, %xmm1
1988 ; SSE2-NEXT: packuswb %xmm3, %xmm1
1989 ; SSE2-NEXT: paddb %xmm0, %xmm0
1990 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1991 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1992 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,1,2,4,8,16,32,64]
1993 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1994 ; SSE2-NEXT: pand %xmm3, %xmm2
1995 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1996 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1]
1997 ; SSE2-NEXT: pand %xmm3, %xmm0
1998 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1999 ; SSE2-NEXT: por %xmm1, %xmm0
2002 ; SSE41-LABEL: constant_funnnel_v16i8:
2004 ; SSE41-NEXT: pxor %xmm2, %xmm2
2005 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2006 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2007 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,2,4,8,16,32,64,128]
2008 ; SSE41-NEXT: psrlw $8, %xmm1
2009 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,128,64,32,16,8,4,2]
2010 ; SSE41-NEXT: psrlw $8, %xmm3
2011 ; SSE41-NEXT: packuswb %xmm1, %xmm3
2012 ; SSE41-NEXT: paddb %xmm0, %xmm0
2013 ; SSE41-NEXT: movdqa %xmm0, %xmm1
2014 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
2015 ; SSE41-NEXT: psllw $8, %xmm1
2016 ; SSE41-NEXT: por %xmm3, %xmm1
2017 ; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
2018 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2019 ; SSE41-NEXT: por %xmm1, %xmm0
2022 ; AVX1-LABEL: constant_funnnel_v16i8:
2024 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2025 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2026 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,2,4,8,16,32,64,128]
2027 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2028 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2029 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,128,64,32,16,8,4,2]
2030 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
2031 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2032 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2033 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
2034 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
2035 ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
2036 ; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
2037 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2038 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2041 ; AVX2-LABEL: constant_funnnel_v16i8:
2043 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2044 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,128,64,32,16,8,4,2,256,2,4,8,16,32,64,128]
2045 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
2046 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2047 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
2048 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2049 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2050 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
2051 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2052 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2053 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2054 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2055 ; AVX2-NEXT: vzeroupper
2058 ; AVX512F-LABEL: constant_funnnel_v16i8:
2060 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2061 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2062 ; AVX512F-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2063 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2064 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2065 ; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
2066 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2067 ; AVX512F-NEXT: vzeroupper
2068 ; AVX512F-NEXT: retq
2070 ; AVX512VL-LABEL: constant_funnnel_v16i8:
2071 ; AVX512VL: # %bb.0:
2072 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2073 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2074 ; AVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2075 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2076 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2077 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
2078 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2079 ; AVX512VL-NEXT: vzeroupper
2080 ; AVX512VL-NEXT: retq
2082 ; AVX512BW-LABEL: constant_funnnel_v16i8:
2083 ; AVX512BW: # %bb.0:
2084 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2085 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2086 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2087 ; AVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
2088 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
2089 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
2090 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2091 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2092 ; AVX512BW-NEXT: vzeroupper
2093 ; AVX512BW-NEXT: retq
2095 ; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2096 ; AVX512VBMI2: # %bb.0:
2097 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2098 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2099 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
2100 ; AVX512VBMI2-NEXT: vpermt2b %zmm0, %zmm2, %zmm1
2101 ; AVX512VBMI2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2102 ; AVX512VBMI2-NEXT: vpsrlvw %zmm0, %zmm1, %zmm0
2103 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2104 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2105 ; AVX512VBMI2-NEXT: vzeroupper
2106 ; AVX512VBMI2-NEXT: retq
2108 ; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2109 ; AVX512VLBW: # %bb.0:
2110 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2111 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2112 ; AVX512VLBW-NEXT: vpsllw $8, %ymm0, %ymm0
2113 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
2114 ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2115 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2116 ; AVX512VLBW-NEXT: vzeroupper
2117 ; AVX512VLBW-NEXT: retq
2119 ; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2120 ; AVX512VLVBMI2: # %bb.0:
2121 ; AVX512VLVBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2122 ; AVX512VLVBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2123 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
2124 ; AVX512VLVBMI2-NEXT: vpermi2b %ymm0, %ymm1, %ymm2
2125 ; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2126 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2127 ; AVX512VLVBMI2-NEXT: vzeroupper
2128 ; AVX512VLVBMI2-NEXT: retq
2130 ; XOP-LABEL: constant_funnnel_v16i8:
2132 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2133 ; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0
2134 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2135 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2138 ; X86-SSE2-LABEL: constant_funnnel_v16i8:
2139 ; X86-SSE2: # %bb.0:
2140 ; X86-SSE2-NEXT: pxor %xmm2, %xmm2
2141 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
2142 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2143 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 # [256,2,4,8,16,32,64,128]
2144 ; X86-SSE2-NEXT: psrlw $8, %xmm3
2145 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2146 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [256,128,64,32,16,8,4,2]
2147 ; X86-SSE2-NEXT: psrlw $8, %xmm1
2148 ; X86-SSE2-NEXT: packuswb %xmm3, %xmm1
2149 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0
2150 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
2151 ; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2152 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [128,1,2,4,8,16,32,64]
2153 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2154 ; X86-SSE2-NEXT: pand %xmm3, %xmm2
2155 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2156 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [128,64,32,16,8,4,2,1]
2157 ; X86-SSE2-NEXT: pand %xmm3, %xmm0
2158 ; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
2159 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2160 ; X86-SSE2-NEXT: retl
2161 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2162 ret <16 x i8> %res
2163 }
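;
; The per-byte constants <0,1,...,8,7,...,1> are handled by widening: the SSE
; and AVX1 paths shift y's bytes right by multiplying zero-extended words by
; powers of two (pmullw, then psrlw $8) and shift the doubled x left via
; pmullw/pmaddubsw with per-byte power-of-two constants, AVX512F widens to
; dword lanes for vpsrlvd/vpsllvd, and AVX512BW/VBMI2 interleave x and y into
; 16-bit lanes (x in the high byte) and do one variable vpsrlvw by the byte
; amounts (taken mod 8) before truncating back with vpmovwb.
;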
2165 ;
2166 ; Uniform Constant Shifts
2167 ;
2169 define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2170 ; SSE-LABEL: splatconstant_funnnel_v2i64:
2172 ; SSE-NEXT: psrlq $14, %xmm1
2173 ; SSE-NEXT: psllq $50, %xmm0
2174 ; SSE-NEXT: por %xmm1, %xmm0
2177 ; AVX-LABEL: splatconstant_funnnel_v2i64:
2179 ; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
2180 ; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
2181 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2184 ; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2186 ; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1
2187 ; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0
2188 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2189 ; AVX512F-NEXT: retq
2191 ; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2192 ; AVX512VL: # %bb.0:
2193 ; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1
2194 ; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0
2195 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2196 ; AVX512VL-NEXT: retq
2198 ; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2199 ; AVX512BW: # %bb.0:
2200 ; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1
2201 ; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0
2202 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2203 ; AVX512BW-NEXT: retq
2205 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2206 ; AVX512VBMI2: # %bb.0:
2207 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2208 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2209 ; AVX512VBMI2-NEXT: vpshrdq $14, %zmm0, %zmm1, %zmm0
2210 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2211 ; AVX512VBMI2-NEXT: vzeroupper
2212 ; AVX512VBMI2-NEXT: retq
2214 ; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2215 ; AVX512VLBW: # %bb.0:
2216 ; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1
2217 ; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0
2218 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2219 ; AVX512VLBW-NEXT: retq
2221 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2222 ; AVX512VLVBMI2: # %bb.0:
2223 ; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0
2224 ; AVX512VLVBMI2-NEXT: retq
2226 ; XOP-LABEL: splatconstant_funnnel_v2i64:
2228 ; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
2229 ; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
2230 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2233 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
2234 ; X86-SSE2: # %bb.0:
2235 ; X86-SSE2-NEXT: psrlq $14, %xmm1
2236 ; X86-SSE2-NEXT: psllq $50, %xmm0
2237 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2238 ; X86-SSE2-NEXT: retl
2239 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
2240 ret <2 x i64> %res
2241 }
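;
; A uniform constant amount needs no masking at all: the non-VBMI2 targets
; reduce the v2i64 case to psrlq $14 / psllq $50 plus a por (64 - 14 = 50),
; while the VBMI2 targets match vpshrdq $14 directly. The narrower element
; types below follow the same pattern (pslld $28 for 32 - 4, psllw $9 for
; 16 - 7).
;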
2243 define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2244 ; SSE-LABEL: splatconstant_funnnel_v4i32:
2246 ; SSE-NEXT: psrld $4, %xmm1
2247 ; SSE-NEXT: pslld $28, %xmm0
2248 ; SSE-NEXT: por %xmm1, %xmm0
2251 ; AVX-LABEL: splatconstant_funnnel_v4i32:
2253 ; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
2254 ; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2255 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2258 ; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2260 ; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1
2261 ; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0
2262 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2263 ; AVX512F-NEXT: retq
2265 ; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2266 ; AVX512VL: # %bb.0:
2267 ; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
2268 ; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0
2269 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2270 ; AVX512VL-NEXT: retq
2272 ; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2273 ; AVX512BW: # %bb.0:
2274 ; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1
2275 ; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0
2276 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2277 ; AVX512BW-NEXT: retq
2279 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2280 ; AVX512VBMI2: # %bb.0:
2281 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2282 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2283 ; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0
2284 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2285 ; AVX512VBMI2-NEXT: vzeroupper
2286 ; AVX512VBMI2-NEXT: retq
2288 ; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2289 ; AVX512VLBW: # %bb.0:
2290 ; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1
2291 ; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0
2292 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2293 ; AVX512VLBW-NEXT: retq
2295 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2296 ; AVX512VLVBMI2: # %bb.0:
2297 ; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
2298 ; AVX512VLVBMI2-NEXT: retq
2300 ; XOP-LABEL: splatconstant_funnnel_v4i32:
2302 ; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
2303 ; XOP-NEXT: vpslld $28, %xmm0, %xmm0
2304 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2307 ; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
2308 ; X86-SSE2: # %bb.0:
2309 ; X86-SSE2-NEXT: psrld $4, %xmm1
2310 ; X86-SSE2-NEXT: pslld $28, %xmm0
2311 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2312 ; X86-SSE2-NEXT: retl
2313 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
2314 ret <4 x i32> %res
2315 }
2317 define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2318 ; SSE-LABEL: splatconstant_funnnel_v8i16:
2320 ; SSE-NEXT: psrlw $7, %xmm1
2321 ; SSE-NEXT: psllw $9, %xmm0
2322 ; SSE-NEXT: por %xmm1, %xmm0
2325 ; AVX-LABEL: splatconstant_funnnel_v8i16:
2327 ; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
2328 ; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2329 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2332 ; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2334 ; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1
2335 ; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
2336 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2337 ; AVX512F-NEXT: retq
2339 ; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2340 ; AVX512VL: # %bb.0:
2341 ; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1
2342 ; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
2343 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2344 ; AVX512VL-NEXT: retq
2346 ; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2347 ; AVX512BW: # %bb.0:
2348 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
2349 ; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
2350 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2351 ; AVX512BW-NEXT: retq
2353 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2354 ; AVX512VBMI2: # %bb.0:
2355 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2356 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2357 ; AVX512VBMI2-NEXT: vpshrdw $7, %zmm0, %zmm1, %zmm0
2358 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2359 ; AVX512VBMI2-NEXT: vzeroupper
2360 ; AVX512VBMI2-NEXT: retq
2362 ; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2363 ; AVX512VLBW: # %bb.0:
2364 ; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1
2365 ; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
2366 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2367 ; AVX512VLBW-NEXT: retq
2369 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2370 ; AVX512VLVBMI2: # %bb.0:
2371 ; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0
2372 ; AVX512VLVBMI2-NEXT: retq
2374 ; XOP-LABEL: splatconstant_funnnel_v8i16:
2376 ; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
2377 ; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
2378 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2381 ; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
2382 ; X86-SSE2: # %bb.0:
2383 ; X86-SSE2-NEXT: psrlw $7, %xmm1
2384 ; X86-SSE2-NEXT: psllw $9, %xmm0
2385 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2386 ; X86-SSE2-NEXT: retl
2387 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
2388 ret <8 x i16> %res
2389 }
2391 define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2392 ; SSE-LABEL: splatconstant_funnnel_v16i8:
2394 ; SSE-NEXT: psrlw $4, %xmm1
2395 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2396 ; SSE-NEXT: psllw $4, %xmm0
2397 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2398 ; SSE-NEXT: por %xmm1, %xmm0
2401 ; AVX-LABEL: splatconstant_funnnel_v16i8:
2403 ; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
2404 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2405 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2406 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2407 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2410 ; AVX512F-LABEL: splatconstant_funnnel_v16i8:
2412 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2
2413 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0
2414 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2415 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2416 ; AVX512F-NEXT: vzeroupper
2417 ; AVX512F-NEXT: retq
2419 ; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
2420 ; AVX512VL: # %bb.0:
2421 ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2
2422 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0
2423 ; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
2424 ; AVX512VL-NEXT: retq
2426 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
2427 ; AVX512BW: # %bb.0:
2428 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2
2429 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0
2430 ; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2431 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2432 ; AVX512BW-NEXT: vzeroupper
2433 ; AVX512BW-NEXT: retq
2435 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
2436 ; AVX512VBMI2: # %bb.0:
2437 ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2438 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2439 ; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0
2440 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2441 ; AVX512VBMI2-NEXT: vzeroupper
2442 ; AVX512VBMI2-NEXT: retq
2444 ; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
2445 ; AVX512VLBW: # %bb.0:
2446 ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2
2447 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0
2448 ; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
2449 ; AVX512VLBW-NEXT: retq
2451 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
2452 ; AVX512VLVBMI2: # %bb.0:
2453 ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2
2454 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0
2455 ; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
2456 ; AVX512VLVBMI2-NEXT: retq
2458 ; XOP-LABEL: splatconstant_funnnel_v16i8:
2460 ; XOP-NEXT: vpsrlw $4, %xmm1, %xmm1
2461 ; XOP-NEXT: vpsllw $4, %xmm0, %xmm0
2462 ; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0
2465 ; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
2466 ; X86-SSE2: # %bb.0:
2467 ; X86-SSE2-NEXT: psrlw $4, %xmm1
2468 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2469 ; X86-SSE2-NEXT: psllw $4, %xmm0
2470 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2471 ; X86-SSE2-NEXT: por %xmm1, %xmm0
2472 ; X86-SSE2-NEXT: retl
2473 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
2474 ret <16 x i8> %res
2475 }
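;
; Bytes have no dedicated shift, so the uniform 4-bit case is done as word
; shifts plus masks: psrlw $4 / psllw $4 followed by pand with constant-pool
; masks (presumably 0x0f / 0xf0 per byte) to drop the bits that crossed byte
; boundaries. The AVX512 targets fold the two masks and the or into a single
; vpternlogd $216, whose truth table (0xd8) is a bitwise select between the
; two shifted values, and XOP does the same with vpcmov.
;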