1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512BW
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VLBW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
15 ; Just one 32-bit run to make sure we do reasonable things for i64 cases.
16 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
; Funnel-shift-right intrinsic on <2 x i32>; every test in this file calls it as
; (%x, %y, <shift amount>). The splat-constant test (amount 4) expects
; `pslld $28` on xmm0 OR'd with `psrld $4` on xmm1, i.e. presumably
; (x << 28) | (y >> 4) per lane -- confirm register mapping against the ABI.
18 declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
; Per-lane variable shift: fshr(%x, %y, %amt) with an independent amount in each lane.
; Lowering strategies pinned by the checks below:
;  - SSE2 / SSE4.1 / AVX1 / X86-SSE2: the amount is masked with 31; the right shift
;    uses a separate scalar-count psrld per lane whose results are recombined, and
;    the left shift is a multiply (pmuludq / pmulld) of x+x by a value derived from
;    the inverted amount (pandn, pslld $23, paddd <constant>, cvttps2dq).
;  - AVX2 / AVX512F / AVX512VL / AVX512BW / AVX512VLBW / XOPAVX2: vpsrlvd on one
;    input, vpsllvd on x+x with the inverted amount, then vpor.
;  - XOPAVX1: two vpshld, the second one with a negated amount (vpxor + vpsubd).
;  - AVX512VBMI2 / AVX512VLVBMI2: a single vpshrdvd (on zmm registers when VL is
;    not available, hence the "kill" annotations and vzeroupper).
; NOTE(review): "funnnel" (three n's) is the spelling used by every label in this
; file; keep it as-is so the -LABEL checks keep matching the function name.
24 define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
25 ; SSE2-LABEL: var_funnnel_v2i32:
27 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
28 ; SSE2-NEXT: movdqa %xmm2, %xmm5
29 ; SSE2-NEXT: pand %xmm4, %xmm5
30 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
31 ; SSE2-NEXT: movdqa %xmm1, %xmm6
32 ; SSE2-NEXT: psrld %xmm3, %xmm6
33 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
34 ; SSE2-NEXT: movdqa %xmm1, %xmm3
35 ; SSE2-NEXT: psrld %xmm7, %xmm3
36 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
37 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
38 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
39 ; SSE2-NEXT: movdqa %xmm1, %xmm7
40 ; SSE2-NEXT: psrld %xmm6, %xmm7
41 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
42 ; SSE2-NEXT: psrld %xmm5, %xmm1
43 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
44 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
45 ; SSE2-NEXT: pandn %xmm4, %xmm2
46 ; SSE2-NEXT: pslld $23, %xmm2
47 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
48 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
49 ; SSE2-NEXT: paddd %xmm0, %xmm0
50 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
51 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
52 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
53 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
54 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
55 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
56 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
57 ; SSE2-NEXT: por %xmm3, %xmm0
60 ; SSE41-LABEL: var_funnnel_v2i32:
62 ; SSE41-NEXT: pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
63 ; SSE41-NEXT: movdqa %xmm2, %xmm4
64 ; SSE41-NEXT: pand %xmm3, %xmm4
65 ; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
66 ; SSE41-NEXT: movdqa %xmm1, %xmm6
67 ; SSE41-NEXT: psrld %xmm5, %xmm6
68 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
69 ; SSE41-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
70 ; SSE41-NEXT: movdqa %xmm1, %xmm8
71 ; SSE41-NEXT: psrld %xmm7, %xmm8
72 ; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
73 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
74 ; SSE41-NEXT: movdqa %xmm1, %xmm6
75 ; SSE41-NEXT: psrld %xmm4, %xmm6
76 ; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
77 ; SSE41-NEXT: psrld %xmm4, %xmm1
78 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
79 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
80 ; SSE41-NEXT: pandn %xmm3, %xmm2
81 ; SSE41-NEXT: pslld $23, %xmm2
82 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
83 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
84 ; SSE41-NEXT: paddd %xmm0, %xmm0
85 ; SSE41-NEXT: pmulld %xmm1, %xmm0
86 ; SSE41-NEXT: por %xmm6, %xmm0
89 ; AVX1-LABEL: var_funnnel_v2i32:
91 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
92 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
93 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
94 ; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
95 ; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
96 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
97 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
98 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
99 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
100 ; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
101 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
102 ; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1
103 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
104 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
105 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
106 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
107 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
108 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
109 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
110 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
111 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
114 ; AVX2-LABEL: var_funnnel_v2i32:
116 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
117 ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
118 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
119 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
120 ; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
121 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
122 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
125 ; AVX512F-LABEL: var_funnnel_v2i32:
127 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
128 ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
129 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
130 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
131 ; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0
132 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
133 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
136 ; AVX512VL-LABEL: var_funnnel_v2i32:
138 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
139 ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
140 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
141 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
142 ; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0
143 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
144 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
145 ; AVX512VL-NEXT: retq
147 ; AVX512BW-LABEL: var_funnnel_v2i32:
149 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
150 ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
151 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
152 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
153 ; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
154 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
155 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
156 ; AVX512BW-NEXT: retq
158 ; AVX512VLBW-LABEL: var_funnnel_v2i32:
159 ; AVX512VLBW: # %bb.0:
160 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
161 ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
162 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
163 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
164 ; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
165 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
166 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
167 ; AVX512VLBW-NEXT: retq
169 ; AVX512VBMI2-LABEL: var_funnnel_v2i32:
170 ; AVX512VBMI2: # %bb.0:
171 ; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
172 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
173 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
174 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
175 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
176 ; AVX512VBMI2-NEXT: vzeroupper
177 ; AVX512VBMI2-NEXT: retq
179 ; AVX512VLVBMI2-LABEL: var_funnnel_v2i32:
180 ; AVX512VLVBMI2: # %bb.0:
181 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
182 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
183 ; AVX512VLVBMI2-NEXT: retq
185 ; XOPAVX1-LABEL: var_funnnel_v2i32:
187 ; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
188 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
189 ; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
190 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
191 ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
192 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
193 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
194 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
195 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
198 ; XOPAVX2-LABEL: var_funnnel_v2i32:
200 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
201 ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
202 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
203 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
204 ; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
205 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
206 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
209 ; X86-SSE2-LABEL: var_funnnel_v2i32:
211 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
212 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
213 ; X86-SSE2-NEXT: pand %xmm4, %xmm5
214 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
215 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
216 ; X86-SSE2-NEXT: psrld %xmm3, %xmm6
217 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
218 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
219 ; X86-SSE2-NEXT: psrld %xmm7, %xmm3
220 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
221 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
222 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
223 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
224 ; X86-SSE2-NEXT: psrld %xmm6, %xmm7
225 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
226 ; X86-SSE2-NEXT: psrld %xmm5, %xmm1
227 ; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
228 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
229 ; X86-SSE2-NEXT: pandn %xmm4, %xmm2
230 ; X86-SSE2-NEXT: pslld $23, %xmm2
231 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
232 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
233 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0
234 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
235 ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
236 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
237 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
238 ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
239 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
240 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
241 ; X86-SSE2-NEXT: por %xmm3, %xmm0
242 ; X86-SSE2-NEXT: retl
243 %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
248 ; Uniform Variable Shifts
; Uniform variable shift: element 0 of %amt is broadcast to both lanes (shufflevector
; with a zeroinitializer mask) and used as the fshr amount.
; Lowering strategies pinned by the checks below:
;  - SSE / AVX1 / AVX2 / AVX512F / AVX512BW / XOP / X86-SSE2: xmm1 and xmm0 are
;    interleaved into 64-bit lanes (punpckhdq / punpckldq), both halves are shifted
;    by the masked amount with a scalar-count psrlq, and shufps [0,2] keeps the low
;    32 bits of each 64-bit lane.
;  - AVX512VL / AVX512VLBW: same idea, but the two halves are joined in a ymm
;    register (vinserti128), shifted once, and truncated with vpmovqd.
;  - AVX512VBMI2 / AVX512VLVBMI2: vpbroadcastd of the amount followed by vpshrdvd.
251 define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt) nounwind {
252 ; SSE-LABEL: splatvar_funnnel_v2i32:
254 ; SSE-NEXT: movdqa %xmm1, %xmm3
255 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
256 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
257 ; SSE-NEXT: psrlq %xmm2, %xmm3
258 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
259 ; SSE-NEXT: psrlq %xmm2, %xmm1
260 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
261 ; SSE-NEXT: movaps %xmm1, %xmm0
264 ; AVX1-LABEL: splatvar_funnnel_v2i32:
266 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
267 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
268 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
269 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
270 ; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
271 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
274 ; AVX2-LABEL: splatvar_funnnel_v2i32:
276 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
277 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
278 ; AVX2-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
279 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
280 ; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
281 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
284 ; AVX512F-LABEL: splatvar_funnnel_v2i32:
286 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
287 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
288 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
289 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
290 ; AVX512F-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
291 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
294 ; AVX512VL-LABEL: splatvar_funnnel_v2i32:
296 ; AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
297 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
298 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
299 ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
300 ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
301 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
302 ; AVX512VL-NEXT: vzeroupper
303 ; AVX512VL-NEXT: retq
305 ; AVX512BW-LABEL: splatvar_funnnel_v2i32:
307 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
308 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
309 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
310 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
311 ; AVX512BW-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
312 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
313 ; AVX512BW-NEXT: retq
315 ; AVX512VLBW-LABEL: splatvar_funnnel_v2i32:
316 ; AVX512VLBW: # %bb.0:
317 ; AVX512VLBW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
318 ; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
319 ; AVX512VLBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
320 ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
321 ; AVX512VLBW-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
322 ; AVX512VLBW-NEXT: vpmovqd %ymm0, %xmm0
323 ; AVX512VLBW-NEXT: vzeroupper
324 ; AVX512VLBW-NEXT: retq
326 ; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
327 ; AVX512VBMI2: # %bb.0:
328 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
329 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
330 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
331 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
332 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
333 ; AVX512VBMI2-NEXT: vzeroupper
334 ; AVX512VBMI2-NEXT: retq
336 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i32:
337 ; AVX512VLVBMI2: # %bb.0:
338 ; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
339 ; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
340 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
341 ; AVX512VLVBMI2-NEXT: retq
343 ; XOP-LABEL: splatvar_funnnel_v2i32:
345 ; XOP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
346 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
347 ; XOP-NEXT: vpsrlq %xmm2, %xmm3, %xmm3
348 ; XOP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
349 ; XOP-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
350 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
353 ; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
355 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
356 ; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
357 ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
358 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm3
359 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
360 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm1
361 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
362 ; X86-SSE2-NEXT: movaps %xmm1, %xmm0
363 ; X86-SSE2-NEXT: retl
364 %splat = shufflevector <2 x i32> %amt, <2 x i32> undef, <2 x i32> zeroinitializer
365 %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %splat)
; Non-uniform constant shift: fshr(%x, %y, <i32 4, i32 5>).
; Lowering strategies pinned by the checks below:
;  - SSE2 / X86-SSE2: psrld $4 and psrld $5 on copies of xmm1, merged per lane;
;    the left-shift half is a pmuludq by constants (0x10000000 for lane 0, a
;    constant-pool value for lane 1), then por.
;  - SSE4.1 / AVX1: psrld $4 / $5 merged with pblendw, pmulld by a constant-pool
;    vector, then por.
;  - AVX2 / AVX512F / VL / BW / VLBW / XOPAVX2: vpsrlvd + vpsllvd with
;    constant-pool amounts, then vpor.
;  - XOPAVX1: two vpshld with constant-pool amounts, then vpor.
;  - AVX512VBMI2: vpshrdvd on zmm with the amounts materialised as [4,5,0,0];
;    AVX512VLVBMI2: vpshrdvd with the amounts read from the constant pool.
373 define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
374 ; SSE2-LABEL: constant_funnnel_v2i32:
376 ; SSE2-NEXT: movdqa %xmm1, %xmm2
377 ; SSE2-NEXT: psrld $5, %xmm2
378 ; SSE2-NEXT: movdqa %xmm1, %xmm3
379 ; SSE2-NEXT: psrld $4, %xmm3
380 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
381 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
382 ; SSE2-NEXT: movl $268435456, %eax # imm = 0x10000000
383 ; SSE2-NEXT: movd %eax, %xmm1
384 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
385 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
386 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
387 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
388 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
389 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
390 ; SSE2-NEXT: por %xmm3, %xmm1
391 ; SSE2-NEXT: movdqa %xmm1, %xmm0
394 ; SSE41-LABEL: constant_funnnel_v2i32:
396 ; SSE41-NEXT: movdqa %xmm1, %xmm2
397 ; SSE41-NEXT: psrld $5, %xmm2
398 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
399 ; SSE41-NEXT: movdqa %xmm1, %xmm3
400 ; SSE41-NEXT: psrld $4, %xmm3
401 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
402 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
403 ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
404 ; SSE41-NEXT: por %xmm3, %xmm0
407 ; AVX1-LABEL: constant_funnnel_v2i32:
409 ; AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
410 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
411 ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm3
412 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
413 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
414 ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
415 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
418 ; AVX2-LABEL: constant_funnnel_v2i32:
420 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
421 ; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
422 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
425 ; AVX512F-LABEL: constant_funnnel_v2i32:
427 ; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
428 ; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
429 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
432 ; AVX512VL-LABEL: constant_funnnel_v2i32:
434 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
435 ; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
436 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
437 ; AVX512VL-NEXT: retq
439 ; AVX512BW-LABEL: constant_funnnel_v2i32:
441 ; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
442 ; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
443 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
444 ; AVX512BW-NEXT: retq
446 ; AVX512VLBW-LABEL: constant_funnnel_v2i32:
447 ; AVX512VLBW: # %bb.0:
448 ; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
449 ; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
450 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
451 ; AVX512VLBW-NEXT: retq
453 ; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
454 ; AVX512VBMI2: # %bb.0:
455 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
456 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
457 ; AVX512VBMI2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,0,0]
458 ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1
459 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
460 ; AVX512VBMI2-NEXT: vzeroupper
461 ; AVX512VBMI2-NEXT: retq
463 ; AVX512VLVBMI2-LABEL: constant_funnnel_v2i32:
464 ; AVX512VLVBMI2: # %bb.0:
465 ; AVX512VLVBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
466 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
467 ; AVX512VLVBMI2-NEXT: retq
469 ; XOPAVX1-LABEL: constant_funnnel_v2i32:
471 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
472 ; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
473 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
476 ; XOPAVX2-LABEL: constant_funnnel_v2i32:
478 ; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
479 ; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
480 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
483 ; X86-SSE2-LABEL: constant_funnnel_v2i32:
485 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
486 ; X86-SSE2-NEXT: psrld $5, %xmm3
487 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
488 ; X86-SSE2-NEXT: psrld $4, %xmm2
489 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
490 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
491 ; X86-SSE2-NEXT: movl $268435456, %eax # imm = 0x10000000
492 ; X86-SSE2-NEXT: movd %eax, %xmm1
493 ; X86-SSE2-NEXT: pmuludq %xmm0, %xmm1
494 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
495 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
496 ; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
497 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
498 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
499 ; X86-SSE2-NEXT: por %xmm2, %xmm1
500 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
501 ; X86-SSE2-NEXT: retl
502 %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
507 ; Uniform Constant Shifts
; Uniform constant shift: fshr(%x, %y, <i32 4, i32 4>).
; With the same immediate in both lanes, SSE / AVX / XOP / X86-SSE2 are expected
; to emit plain immediate shifts -- psrld $4 on xmm1, pslld $28 on xmm0 -- joined
; by por, while the VBMI2 configurations fold everything into one vpshrdd $4
; (on zmm registers when AVX512VL is not enabled).
510 define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
511 ; SSE-LABEL: splatconstant_funnnel_v2i32:
513 ; SSE-NEXT: psrld $4, %xmm1
514 ; SSE-NEXT: pslld $28, %xmm0
515 ; SSE-NEXT: por %xmm1, %xmm0
518 ; AVX-LABEL: splatconstant_funnnel_v2i32:
520 ; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
521 ; AVX-NEXT: vpslld $28, %xmm0, %xmm0
522 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
525 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i32:
526 ; AVX512VBMI2: # %bb.0:
527 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
528 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
529 ; AVX512VBMI2-NEXT: vpshrdd $4, %zmm0, %zmm1, %zmm0
530 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
531 ; AVX512VBMI2-NEXT: vzeroupper
532 ; AVX512VBMI2-NEXT: retq
534 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i32:
535 ; AVX512VLVBMI2: # %bb.0:
536 ; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
537 ; AVX512VLVBMI2-NEXT: retq
539 ; XOP-LABEL: splatconstant_funnnel_v2i32:
541 ; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
542 ; XOP-NEXT: vpslld $28, %xmm0, %xmm0
543 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
546 ; X86-SSE2-LABEL: splatconstant_funnnel_v2i32:
548 ; X86-SSE2-NEXT: psrld $4, %xmm1
549 ; X86-SSE2-NEXT: pslld $28, %xmm0
550 ; X86-SSE2-NEXT: por %xmm1, %xmm0
551 ; X86-SSE2-NEXT: retl
552 %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 4>)