; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64

; Verify that we correctly fold target-specific packed vector shifts by an
; immediate count into a simple build_vector when the elements of the input
; vector to the packed shift are all constants or undef.
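
; For example, in test1 below, pslli.w by 3 applied to <1,2,4,8,1,2,4,8> folds
; to the constant <8,16,32,64,8,16,32,64>, so only a constant load and no
; shift instruction is expected.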

define <8 x i16> @test1() {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
  ret <8 x i16> %1
}

define <8 x i16> @test2() {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
  ret <8 x i16> %1
}

define <8 x i16> @test3() {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test4() {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,16,32,64]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
  ret <4 x i32> %1
}

define <4 x i32> @test5() {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <4 x i32> @test6() {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
  ret <4 x i32> %1
}
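
; Tests 7 and 8 shift <2 x i64> constants; they need separate X86 and X64
; check lines only because the folded constant is printed with different lane
; widths on the two targets. The fold itself is the same on both.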

define <2 x i64> @test7() {
; X86-LABEL: test7:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0]
; X86-NEXT:    retl
;
; X64-LABEL: test7:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [8,16]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
  ret <2 x i64> %1
}

define <2 x i64> @test8() {
; X86-LABEL: test8:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [1,0,2,0]
; X86-NEXT:    retl
;
; X64-LABEL: test8:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,2]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
  ret <2 x i64> %1
}
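
; Tests 9 through 16 shift constant vectors that contain undef elements; the
; undef lanes become zero in the folded constant.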

define <8 x i16> @test9() {
; CHECK-LABEL: test9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test10() {
; CHECK-LABEL: test10:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <2 x i64> @test11() {
; X86-LABEL: test11:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [0,0,3,0]
; X86-NEXT:    retl
;
; X64-LABEL: test11:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
  ret <2 x i64> %1
}

define <8 x i16> @test12() {
; CHECK-LABEL: test12:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test13() {
; CHECK-LABEL: test13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <8 x i16> @test14() {
; CHECK-LABEL: test14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test15() {
; CHECK-LABEL: test15:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,64,0,256]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <2 x i64> @test16() {
; X86-LABEL: test16:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [0,0,248,0]
; X86-NEXT:    retl
;
; X64-LABEL: test16:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,248,0,0,0,0,0,0,0]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
  ret <2 x i64> %1
}

; Make sure we fold fully undef input vectors. We previously folded only when
; the undef value had a single use, so use two undefs.
define <4 x i32> @test17(<4 x i32> %a0, ptr %dummy) {
; X86-LABEL: test17:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    xorps %xmm0, %xmm0
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test17:
; X64:       # %bb.0:
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 6)
  store <4 x i32> %a, ptr %dummy
  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 7)
  ret <4 x i32> %res
}

define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) {
; X86-LABEL: test18:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    xorps %xmm0, %xmm0
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test18:
; X64:       # %bb.0:
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 3)
  store <4 x i32> %a, ptr %dummy
  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 1)
  ret <4 x i32> %res
}
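
; The remaining tests shift by a variable amount that is extracted from a
; vector element and adjusted with a scalar add or sub. In the simple cases
; the computation is expected to stay in the vector domain and feed the
; register-count form of the shift (pslld/psrld/psrad with an XMM count).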

define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
; CHECK-LABEL: extelt0_sub_pslli_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
; CHECK-NEXT:    psubd %xmm1, %xmm2
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; CHECK-NEXT:    pslld %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x i32> %y, i64 0
  %bo = sub i32 32, %ext
  %r = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}

define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){
; X86-LABEL: extelt1_add_psrli_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT:    xorps %xmm2, %xmm2
; X86-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X86-NEXT:    psrld %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: extelt1_add_psrli_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT:    psrld %xmm2, %xmm0
; X64-NEXT:    retq
  %ext = extractelement <4 x i32> %y, i64 1
  %bo = add i32 %ext, 3
  %r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}

define i32 @extelt1_add_psrai_v4i32_uses(<4 x i32> %x, <4 x i32> %y){
; CHECK-LABEL: extelt1_add_psrai_v4i32_uses:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-NEXT:    movd %xmm1, %ecx
; CHECK-NEXT:    addl $3, %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT:    psrad %xmm1, %xmm0
; CHECK-NEXT:    movd %xmm0, %eax
; CHECK-NEXT:    imull %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x i32> %y, i64 1
  %bo = add i32 %ext, 3
  %r = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 %bo)
  %ext3 = extractelement <4 x i32> %r, i64 3
  %r2 = mul i32 %bo, %ext3
  ret i32 %r2
}
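
; When the shift amount is computed from elements of two different vectors,
; the subtraction is done in scalar registers and the result is moved back
; into an XMM register for the register-count shift.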

define <4 x i32> @extelt0_twice_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z){
; CHECK-LABEL: extelt0_twice_sub_pslli_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movd %xmm1, %eax
; CHECK-NEXT:    movd %xmm2, %ecx
; CHECK-NEXT:    subl %ecx, %eax
; CHECK-NEXT:    movd %eax, %xmm1
; CHECK-NEXT:    pslld %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext1 = extractelement <4 x i32> %y, i64 0
  %ext2 = extractelement <4 x i32> %z, i64 0
  %bo = sub i32 %ext1, %ext2
  %r = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}

; This would crash because the scalar shift amount has a different type than
; the shift result.

define <2 x i8> @PR58661(<2 x i8> %a0) {
; CHECK-LABEL: PR58661:
; CHECK:       # %bb.0:
; CHECK-NEXT:    psrlw $8, %xmm0
; CHECK-NEXT:    movd %xmm0, %eax
; CHECK-NEXT:    shll $8, %eax
; CHECK-NEXT:    movd %eax, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %shuffle = shufflevector <2 x i8> %a0, <2 x i8> <i8 poison, i8 0>, <2 x i32> <i32 1, i32 3>
  %x = bitcast <2 x i8> %shuffle to i16
  %shl = shl nuw i16 %x, 8
  %y = bitcast i16 %shl to <2 x i8>
  ret <2 x i8> %y
}

declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)