1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VLBW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
15 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
18 declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
24 define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
; Variable funnel shift left: %x, %y and the per-lane amounts in %amt are
; forwarded unchanged to @llvm.fshl.v2i32 by the call at the end of the body.
; The assertion lines that follow are autogenerated (one group per llc
; configuration listed in the file header); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
25 ; SSE2-LABEL: var_funnnel_v2i32:
27 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
28 ; SSE2-NEXT: movdqa %xmm2, %xmm5
29 ; SSE2-NEXT: pandn %xmm4, %xmm5
30 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
31 ; SSE2-NEXT: psrld $1, %xmm1
32 ; SSE2-NEXT: movdqa %xmm1, %xmm6
33 ; SSE2-NEXT: psrld %xmm3, %xmm6
34 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
35 ; SSE2-NEXT: movdqa %xmm1, %xmm3
36 ; SSE2-NEXT: psrld %xmm7, %xmm3
37 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
38 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
39 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
40 ; SSE2-NEXT: movdqa %xmm1, %xmm7
41 ; SSE2-NEXT: psrld %xmm6, %xmm7
42 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
43 ; SSE2-NEXT: psrld %xmm5, %xmm1
44 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
45 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
46 ; SSE2-NEXT: pand %xmm4, %xmm2
47 ; SSE2-NEXT: pslld $23, %xmm2
48 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
49 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
50 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
51 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
52 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
53 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
54 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
55 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
56 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
57 ; SSE2-NEXT: por %xmm3, %xmm0
60 ; SSE41-LABEL: var_funnnel_v2i32:
62 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
63 ; SSE41-NEXT: movdqa %xmm2, %xmm4
64 ; SSE41-NEXT: pandn %xmm3, %xmm4
65 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
66 ; SSE41-NEXT: psrld $1, %xmm1
67 ; SSE41-NEXT: movdqa %xmm1, %xmm6
68 ; SSE41-NEXT: psrld %xmm5, %xmm6
69 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
70 ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
71 ; SSE41-NEXT: movdqa %xmm1, %xmm8
72 ; SSE41-NEXT: psrld %xmm7, %xmm8
73 ; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
74 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
75 ; SSE41-NEXT: movdqa %xmm1, %xmm6
76 ; SSE41-NEXT: psrld %xmm4, %xmm6
77 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
78 ; SSE41-NEXT: psrld %xmm4, %xmm1
79 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
80 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
81 ; SSE41-NEXT: pand %xmm3, %xmm2
82 ; SSE41-NEXT: pslld $23, %xmm2
83 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
84 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
85 ; SSE41-NEXT: pmulld %xmm1, %xmm0
86 ; SSE41-NEXT: por %xmm6, %xmm0
89 ; AVX1-LABEL: var_funnnel_v2i32:
91 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
92 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
93 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
94 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
95 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
96 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
97 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
98 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
99 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
100 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
101 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
102 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
103 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
104 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
105 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
106 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
107 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
108 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
109 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
110 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
111 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
114 ; AVX2-LABEL: var_funnnel_v2i32:
116 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
117 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
118 ; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
119 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
120 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
121 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
122 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
125 ; AVX512F-LABEL: var_funnnel_v2i32:
127 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
128 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
129 ; AVX512F-NEXT: vpsrld $1, %xmm1, %xmm1
130 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
131 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
132 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
133 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
136 ; AVX512VL-LABEL: var_funnnel_v2i32:
138 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
139 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
140 ; AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1
141 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
142 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
143 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
144 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
145 ; AVX512VL-NEXT: retq
147 ; AVX512BW-LABEL: var_funnnel_v2i32:
149 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
150 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
151 ; AVX512BW-NEXT: vpsrld $1, %xmm1, %xmm1
152 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
153 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
154 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
155 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
156 ; AVX512BW-NEXT: retq
158 ; AVX512VLBW-LABEL: var_funnnel_v2i32:
159 ; AVX512VLBW: # %bb.0:
160 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
161 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
162 ; AVX512VLBW-NEXT: vpsrld $1, %xmm1, %xmm1
163 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
164 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
165 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
166 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
167 ; AVX512VLBW-NEXT: retq
169 ; AVX512VBMI2-LABEL: var_funnnel_v2i32:
170 ; AVX512VBMI2: # %bb.0:
171 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
172 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
173 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
174 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
175 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
176 ; AVX512VBMI2-NEXT: vzeroupper
177 ; AVX512VBMI2-NEXT: retq
179 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i32:
180 ; AVX512VLVBMI2: # %bb.0:
181 ; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
182 ; AVX512VLVBMI2-NEXT: retq
184 ; XOPAVX1-LABEL: var_funnnel_v2i32:
186 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
187 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
188 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
189 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
190 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
191 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
192 ; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1
193 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
194 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
197 ; XOPAVX2-LABEL: var_funnnel_v2i32:
199 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
200 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
201 ; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1
202 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
203 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
204 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
205 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
208 ; X86-SSE2-LABEL: var_funnnel_v2i32:
210 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
211 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
212 ; X86-SSE2-NEXT: pandn %xmm4, %xmm5
213 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
214 ; X86-SSE2-NEXT: psrld $1, %xmm1
215 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
216 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6
217 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
218 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
219 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3
220 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
221 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
222 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
223 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
224 ; X86-SSE2-NEXT: psrld %xmm6, %xmm7
225 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
226 ; X86-SSE2-NEXT: psrld %xmm5, %xmm1
227 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
228 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
229 ; X86-SSE2-NEXT: pand %xmm4, %xmm2
230 ; X86-SSE2-NEXT: pslld $23, %xmm2
231 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
232 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
233 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
234 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
235 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
236 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
237 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
238 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
239 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
240 ; X86-SSE2-NEXT: por %xmm3, %xmm0
241 ; X86-SSE2-NEXT: retl
242 %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
247 ; Uniform Variable Shifts
250 define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
; Uniform variable funnel shift left: lane 0 of %amt is splatted to both lanes
; before the @llvm.fshl.v2i32 call at the end of the body, so every lane is
; shifted by the same runtime amount.
; The assertion lines that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
251 ; SSE-LABEL: splatvar_funnnel_v2i32:
253 ; SSE-NEXT: movdqa %xmm1, %xmm3
254 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
255 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
256 ; SSE-NEXT: psllq %xmm2, %xmm3
257 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
258 ; SSE-NEXT: psllq %xmm2, %xmm1
259 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
260 ; SSE-NEXT: movaps %xmm1, %xmm0
263 ; AVX-LABEL: splatvar_funnnel_v2i32:
265 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
266 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
267 ; AVX-NEXT: vpsllq %xmm2, %xmm3, %xmm3
268 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
269 ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
270 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
273 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
274 ; AVX512VBMI2: # %bb.0:
275 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
276 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
277 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
278 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
279 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
280 ; AVX512VBMI2-NEXT: vzeroupper
281 ; AVX512VBMI2-NEXT: retq
283 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i32:
284 ; AVX512VLVBMI2: # %bb.0:
285 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
286 ; AVX512VLVBMI2-NEXT: vpshldvd %xmm2, %xmm1, %xmm0
287 ; AVX512VLVBMI2-NEXT: retq
289 ; XOP-LABEL: splatvar_funnnel_v2i32:
291 ; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
292 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
293 ; XOP-NEXT: vpsllq %xmm2, %xmm3, %xmm3
294 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
295 ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
296 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
299 ; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
301 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
302 ; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
303 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
304 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3
305 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
306 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1
307 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
308 ; X86-SSE2-NEXT: movaps %xmm1, %xmm0
309 ; X86-SSE2-NEXT: retl
; The all-zero shuffle mask selects element 0 of %amt for both result lanes.
310 %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer
311 %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %splat)
319 define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; Constant funnel shift left: the @llvm.fshl.v2i32 call at the end of the body
; uses the literal per-lane amounts <4, 5>. In the shift-based lowerings below
; the matching right shifts of %y appear as 28 and 27 (32 - 4 and 32 - 5).
; The assertion lines that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
320 ; SSE2-LABEL: constant_funnnel_v2i32:
322 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
323 ; SSE2-NEXT: psrld $28, %xmm1
324 ; SSE2-NEXT: psrld $27, %xmm2
325 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
326 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
327 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
328 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
329 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
330 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
331 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
332 ; SSE2-NEXT: por %xmm1, %xmm0
335 ; SSE41-LABEL: constant_funnnel_v2i32:
337 ; SSE41-NEXT: movdqa %xmm1, %xmm2
338 ; SSE41-NEXT: psrld $27, %xmm2
339 ; SSE41-NEXT: psrld $28, %xmm1
340 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
341 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
342 ; SSE41-NEXT: por %xmm2, %xmm0
345 ; AVX1-LABEL: constant_funnnel_v2i32:
347 ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2
348 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
349 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
350 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
351 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
354 ; AVX2-LABEL: constant_funnnel_v2i32:
356 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
357 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
358 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
361 ; AVX512F-LABEL: constant_funnnel_v2i32:
363 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
364 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
365 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
368 ; AVX512VL-LABEL: constant_funnnel_v2i32:
370 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
371 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
372 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
373 ; AVX512VL-NEXT: retq
375 ; AVX512BW-LABEL: constant_funnnel_v2i32:
377 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
378 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
379 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
380 ; AVX512BW-NEXT: retq
382 ; AVX512VLBW-LABEL: constant_funnnel_v2i32:
383 ; AVX512VLBW: # %bb.0:
384 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
385 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
386 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
387 ; AVX512VLBW-NEXT: retq
389 ; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
390 ; AVX512VBMI2: # %bb.0:
391 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
392 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
393 ; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,0,0]
394 ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0
395 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
396 ; AVX512VBMI2-NEXT: vzeroupper
397 ; AVX512VBMI2-NEXT: retq
399 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i32:
400 ; AVX512VLVBMI2: # %bb.0:
401 ; AVX512VLVBMI2-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
402 ; AVX512VLVBMI2-NEXT: retq
404 ; XOPAVX1-LABEL: constant_funnnel_v2i32:
406 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
407 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
408 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
411 ; XOPAVX2-LABEL: constant_funnnel_v2i32:
413 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
414 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
415 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
418 ; X86-SSE2-LABEL: constant_funnnel_v2i32:
420 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
421 ; X86-SSE2-NEXT: psrld $28, %xmm1
422 ; X86-SSE2-NEXT: psrld $27, %xmm2
423 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
424 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
425 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
426 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
427 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
428 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
429 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
430 ; X86-SSE2-NEXT: por %xmm1, %xmm0
431 ; X86-SSE2-NEXT: retl
432 %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
437 ; Uniform Constant Shifts
440 define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; Uniform constant funnel shift left: both lanes use the literal amount 4 in
; the @llvm.fshl.v2i32 call at the end of the body. The shift-based lowerings
; below shift %x left by 4, shift %y right by 28, and OR the two results.
; The assertion lines that follow are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
441 ; SSE-LABEL: splatconstant_funnnel_v2i32:
443 ; SSE-NEXT: psrld $28, %xmm1
444 ; SSE-NEXT: pslld $4, %xmm0
445 ; SSE-NEXT: por %xmm1, %xmm0
448 ; AVX-LABEL: splatconstant_funnnel_v2i32:
450 ; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
451 ; AVX-NEXT: vpslld $4, %xmm0, %xmm0
452 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
455 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i32:
456 ; AVX512VBMI2: # %bb.0:
457 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
458 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
459 ; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0
460 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
461 ; AVX512VBMI2-NEXT: vzeroupper
462 ; AVX512VBMI2-NEXT: retq
464 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i32:
465 ; AVX512VLVBMI2: # %bb.0:
466 ; AVX512VLVBMI2-NEXT: vpshldd $4, %xmm1, %xmm0, %xmm0
467 ; AVX512VLVBMI2-NEXT: retq
469 ; XOP-LABEL: splatconstant_funnnel_v2i32:
471 ; XOP-NEXT: vpsrld $28, %xmm1, %xmm1
472 ; XOP-NEXT: vpslld $4, %xmm0, %xmm0
473 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
476 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i32:
478 ; X86-SSE2-NEXT: psrld $28, %xmm1
479 ; X86-SSE2-NEXT: pslld $4, %xmm0
480 ; X86-SSE2-NEXT: por %xmm1, %xmm0
481 ; X86-SSE2-NEXT: retl
482 %res = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 4>)