; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
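; For example, x << 3 == x * 8, so a constant shift-amount vector such as
; <1,1,2,3,7,0,9,11> can simply be folded into the multiplier vector
; [2,2,4,8,128,1,512,2048] that pmullw reads from the constant pool.
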
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
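; Note: SSE2 has no packed multiply of 32-bit elements, so without SSE4.1 the
; pmulld is emulated with pmuludq (which only multiplies the even lanes) plus
; pshufd/punpckldq shuffles, as the SSE2 checks below show.
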
define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test3:
; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test3:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw instead.
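; E.g. the repeated shift amounts <1,1,2,3,7,0,9,11> again map to the
; multiplier [2,2,4,8,128,1,512,2048]; the SSE lowering keeps it in one
; register and reuses it for both 128-bit halves.
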
define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.
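; E.g. the shift amounts <1,1,2,3> map to the multiplier [2,2,4,8]; each
; 128-bit half of the <8 x i32> operand is multiplied by that same constant.
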
define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test6:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: test6:
; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE2 instead, we split the shift into four
; parts and then convert each part into a pmullw.
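; In the SSE checks below, all four parts reuse a single
; [2,2,4,8,128,1,512,2048] multiplier register, while the AVX lowering
; broadcasts the same constant to 256 bits and issues two vpmullw.
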
define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: pmullw %xmm4, %xmm1
; SSE-NEXT: pmullw %xmm4, %xmm2
; SSE-NEXT: pmullw %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX-LABEL: test7:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX-NEXT: # ymm2 = mem[0,1,0,1]
; AVX-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX-NEXT: retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
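; E.g. AVX2 broadcasts the shift counts [1,1,2,3,1,1,2,3] and issues two
; 256-bit vpsllvd, whereas AVX512F uses a single 512-bit vpsllvd on %zmm0.
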
define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm4, %xmm1
; SSE41-NEXT: pmulld %xmm4, %xmm2
; SSE41-NEXT: pmulld %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; The shift from 'test9' is lowered as separate immediate shifts that are then
; blended together if we don't have AVX2/AVX512f support.
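; E.g. for the per-element counts <2,3> each half is shifted with psllq $2 and
; psllq $3 and the results are blended (movsd/pblendw); a count of 1 is lowered
; as paddq %xmm, %xmm (i.e. x + x == x << 1).
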
define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm3
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm2
; SSE2-NEXT: retq
;
; SSE41-LABEL: test9:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: paddq %xmm0, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm2
; SSE41-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test9:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}