; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2

; Verify that the following shifts are lowered into a sequence of two shifts plus
; a blend. On pre-AVX2 targets, instead of scalarizing a logical or arithmetic
; packed shift right by a constant build_vector, the backend should emit the
; simpler two-shifts-plus-blend sequence whenever possible.
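;
; For illustration only (not checked output; %x and %r are placeholder names):
; a constant shift such as
;   %r = lshr <8 x i16> %x, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
; can be lowered roughly as "psrlw $3" + "psrlw $2" followed by a blend that
; takes the leading elements from the shift-by-3 result and the remaining
; elements from the shift-by-2 result, instead of eight scalar shifts.
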
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <8 x i16> @test9(<8 x i16> %a) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <8 x i16> %lshr
}

define <8 x i32> @test10(<8 x i32>* %a) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i32>, <8 x i32>* %a, align 32
  %ashr = ashr <8 x i32> %ld, <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %ashr
}

; test11 vs test12 - show the difference between a v16i16 shift whose constant
; is repeated vs. non-repeated at the v8i16 level (this affects the PBLENDW masks).
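; As a rough illustration (not checked output): in test12 the per-half shift
; amounts <1,3,1,1,1,3,3,3> are identical in both 128-bit halves, so AVX2 can
; use a single vpsllw $3 / vpsllw $1 pair plus one vpblendw, whose 8-bit mask
; is applied per 128-bit lane; in test11 the two halves differ, so no single
; blend mask works and the shift is lowered via vpmullw instead.
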
define <16 x i16> @test11(<16 x i16> %a) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
  ret <16 x i16> %lshr
}

define <16 x i16> @test12(<16 x i16> %a) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT:    retq
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <16 x i16> %lshr
}