; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE

; fold (sra 0, x) -> 0
define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sra -1, x) -> -1
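; An arithmetic shift only replicates the sign bit, so shifting a splat of -1
; by any in-range amount leaves the value unchanged; the shift is expected to
; fold to the all-ones constant below.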
define <4 x i32> @combine_vec_ashr_allones(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_allones:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_allones:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %x
  ret <4 x i32> %1
}

; fold (sra x, c >= size(x)) -> undef
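; Per the LangRef, ashr by an amount greater than or equal to the bit width is
; poison, so the combiner may drop the shift entirely; only a bare return is
; expected in the checks below.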
define <4 x i32> @combine_vec_ashr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_ashr_outofrange2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (sra x, 0) -> x
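; A shift by zero is the identity, so the input vector should be returned
; untouched (no shift instructions expected).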
define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
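; e.g. ashr by 2 followed by ashr by 4 becomes a single ashr by 6 (psrad $6
; below). When c1 + c2 reaches the element width the combined shift should
; instead clamp to bitwidth - 1 (all sign bits), as exercised by
; combine_vec_ashr_ashr2.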
define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = ashr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $10, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $6, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $8, %xmm1
; SSE-NEXT:    psrad $4, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = ashr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr2:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = ashr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $27, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    psrad $15, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 1, i32 5, i32 50, i32 27>
  %2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32 0>
  ret <4 x i32> %2
}

; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
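; Truncation distributes over AND, so the i64 mask can be applied after
; narrowing %y to 32 bits; the 256-bit input should then reduce to a single
; 128-bit mask plus one variable shift.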
define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-LABEL: combine_vec_ashr_trunc_and:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrad %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrad %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrad %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
; AVX2-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
; if c1 is equal to the number of bits the trunc removes
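; Here c1 is 32, exactly the number of bits discarded by the trunc to i32, so
; the lshr+trunc just selects the high (odd) dwords; the expected lowering is a
; [1,3] shuffle followed by one variable ashr by <0,1,2,3>.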
define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_lshr:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    psrad $2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_lshr_splat:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $26, %xmm3
; SSE-NEXT:    psrad $26, %xmm2
; SSE-NEXT:    packssdw %xmm3, %xmm2
; SSE-NEXT:    psrad $26, %xmm1
; SSE-NEXT:    psrad $26, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    packsswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_trunc_lshr_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $26, %ymm1, %ymm1
; AVX-NEXT:    vpsrad $26, %ymm0, %ymm0
; AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %1 = lshr <16 x i32> %x, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  %3 = ashr <16 x i8> %2, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <16 x i8> %3
}

; fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
; if c1 is equal to the number of bits the trunc removes
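; Same combine with an arithmetic shift feeding the trunc: sra by 32 followed
; by the trunc also selects the high dwords, so the expected code matches
; combine_vec_ashr_trunc_lshr above.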
define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    psrad $2, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
  %1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr_splat:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $19, %xmm1
; SSE-NEXT:    psrad $19, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_trunc_ashr_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $19, %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %1 = ashr <8 x i32> %x, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  %3 = ashr <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %3
}

; If the sign bit is known to be zero, switch this to a SRL.
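; The AND masks below are non-negative, so the sign bit of each lane is known
; zero and the arithmetic shift can be replaced by a logical one; the checks
; therefore expect psrld/vpsrlvd rather than psrad/vpsravd.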
define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm2, %xmm3
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm5
; SSE-NEXT:    psrld %xmm4, %xmm5
; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psrld %xmm1, %xmm3
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE-NEXT:    psrld %xmm1, %xmm0
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_positive:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 15, i32 255, i32 4095, i32 65535>
  %2 = ashr <4 x i32> %1, %y
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive_splat:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_positive_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 1023, i32 1023, i32 1023, i32 1023>
  %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
  ret <4 x i32> %2
}