1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512
7 ; Verify that we don't scalarize a packed vector shift left of 16-bit
8 ; signed integers if the amount is a constant build_vector.
9 ; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
11 define <8 x i16> @test1(<8 x i16> %a) {
14 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
19 ; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
21 %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
25 define <8 x i16> @test2(<8 x i16> %a) {
28 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
33 ; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
35 %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
39 ; Verify that a vector shift left of 32-bit signed integers is simply expanded
40 ; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
41 ; counts is a constant build_vector.
43 define <4 x i32> @test3(<4 x i32> %a) {
46 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
47 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
48 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
49 ; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
50 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
51 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
56 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
61 ; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
63 %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
67 define <4 x i32> @test4(<4 x i32> %a) {
70 ; SSE2-NEXT: movdqa %xmm0, %xmm1
71 ; SSE2-NEXT: pslld $1, %xmm1
72 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
73 ; SSE2-NEXT: movapd %xmm1, %xmm0
78 ; SSE41-NEXT: movdqa %xmm0, %xmm1
79 ; SSE41-NEXT: pslld $1, %xmm1
80 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
81 ; SSE41-NEXT: movdqa %xmm1, %xmm0
86 ; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
88 %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
92 ; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
93 ; into two pmullw instructions. With AVX2, the test case below would produce
96 define <16 x i16> @test5(<16 x i16> %a) {
99 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
100 ; SSE-NEXT: pmullw %xmm2, %xmm0
101 ; SSE-NEXT: pmullw %xmm2, %xmm1
106 ; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
108 %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
112 ; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
113 ; into two pmulld instructions. With AVX2, the test case below would produce
114 ; a single vpsllvd instead.
116 define <8 x i32> @test6(<8 x i32> %a) {
119 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
120 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
121 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
122 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
123 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
124 ; SSE2-NEXT: pmuludq %xmm4, %xmm3
125 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
126 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
127 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
128 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
129 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
130 ; SSE2-NEXT: pmuludq %xmm4, %xmm1
131 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
132 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
133 ; SSE2-NEXT: movdqa %xmm2, %xmm1
136 ; SSE41-LABEL: test6:
138 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
139 ; SSE41-NEXT: pmulld %xmm2, %xmm0
140 ; SSE41-NEXT: pmulld %xmm2, %xmm1
145 ; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
147 %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
151 ; With AVX2 and AVX512, the test case below should produce a sequence of
152 ; two vpmullw instructions. On SSE2 instead, we split the shift into four
153 ; parts and then convert each part into a pmullw.
155 define <32 x i16> @test7(<32 x i16> %a) {
158 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
159 ; SSE-NEXT: pmullw %xmm4, %xmm0
160 ; SSE-NEXT: pmullw %xmm4, %xmm1
161 ; SSE-NEXT: pmullw %xmm4, %xmm2
162 ; SSE-NEXT: pmullw %xmm4, %xmm3
167 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
168 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
169 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
170 ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
173 ; AVX512-LABEL: test7:
175 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
176 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
177 ; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
178 ; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
179 ; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
180 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
182 %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
186 ; Similar to test7; the difference is that with AVX512 support
187 ; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
189 define <16 x i32> @test8(<16 x i32> %a) {
192 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
193 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
194 ; SSE2-NEXT: pmuludq %xmm4, %xmm0
195 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
196 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
197 ; SSE2-NEXT: pmuludq %xmm6, %xmm5
198 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
199 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
200 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
201 ; SSE2-NEXT: pmuludq %xmm4, %xmm1
202 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
203 ; SSE2-NEXT: pmuludq %xmm6, %xmm5
204 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
205 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
206 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
207 ; SSE2-NEXT: pmuludq %xmm4, %xmm2
208 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
209 ; SSE2-NEXT: pmuludq %xmm6, %xmm5
210 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
211 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
212 ; SSE2-NEXT: pmuludq %xmm3, %xmm4
213 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
214 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
215 ; SSE2-NEXT: pmuludq %xmm6, %xmm3
216 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
217 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
218 ; SSE2-NEXT: movdqa %xmm4, %xmm3
221 ; SSE41-LABEL: test8:
223 ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
224 ; SSE41-NEXT: pmulld %xmm4, %xmm0
225 ; SSE41-NEXT: pmulld %xmm4, %xmm1
226 ; SSE41-NEXT: pmulld %xmm4, %xmm2
227 ; SSE41-NEXT: pmulld %xmm4, %xmm3
232 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
233 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
234 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
235 ; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
238 ; AVX512-LABEL: test8:
240 ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
242 %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
246 ; Without AVX2/AVX512F support, the shift in 'test9' is performed as separate uniform shifts whose results are then blended.
248 define <8 x i64> @test9(<8 x i64> %a) {
251 ; SSE2-NEXT: movdqa %xmm1, %xmm4
252 ; SSE2-NEXT: psllq $2, %xmm4
253 ; SSE2-NEXT: psllq $3, %xmm1
254 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
255 ; SSE2-NEXT: movdqa %xmm3, %xmm4
256 ; SSE2-NEXT: psllq $2, %xmm4
257 ; SSE2-NEXT: psllq $3, %xmm3
258 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
259 ; SSE2-NEXT: paddq %xmm0, %xmm0
260 ; SSE2-NEXT: paddq %xmm2, %xmm2
263 ; SSE41-LABEL: test9:
265 ; SSE41-NEXT: movdqa %xmm1, %xmm4
266 ; SSE41-NEXT: psllq $3, %xmm4
267 ; SSE41-NEXT: psllq $2, %xmm1
268 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
269 ; SSE41-NEXT: movdqa %xmm3, %xmm4
270 ; SSE41-NEXT: psllq $3, %xmm4
271 ; SSE41-NEXT: psllq $2, %xmm3
272 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
273 ; SSE41-NEXT: paddq %xmm0, %xmm0
274 ; SSE41-NEXT: paddq %xmm2, %xmm2
279 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
280 ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
281 ; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
284 ; AVX512-LABEL: test9:
286 ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
288 %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>