; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512
; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
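;
; A hedged worked example (illustrative, not part of the autogenerated checks):
; a left shift by a constant amount c multiplies each lane by 1 << c, so the
; shl in @test1 below is equivalent to
;   %mul = mul <8 x i16> %a, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>
; because <1,1,2,3,7,0,9,11> maps to <2,2,4,8,128,1,512,2048>, the constant the
; pmullw/vpmullw checks match (%mul is a hypothetical name).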
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,4,8,128,1,512,2048]
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,2,4,8,128,1,512,2048]
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,1,1,2,u,u,2]
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,1,1,2,u,u,2]
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
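;
; A hedged note on @test3 below (based on LLVM IR shift semantics, not on the
; checks): the lanes shifted by 1 and 2 become multiplies by 2 and 4, while the
; negative amounts (-1 and -3) are out of range for i32 and therefore poison,
; so the backend may pick any multiplier for those lanes, e.g.
;   %mul = mul <4 x i32> %a, <i32 2, i32 poison, i32 4, i32 poison>
; (%mul and the chosen poison lanes are illustrative).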
define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pslld $1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw instead.
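;
; A hedged sketch of the split for @test5 below (illustrative, not part of the
; checks): without AVX2 the 256-bit operation is performed as two 128-bit
; multiplies that reuse one constant, roughly
;   %lo = mul <8 x i16> %a.lo, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>
;   %hi = mul <8 x i16> %a.hi, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>
; where %a.lo/%a.hi are hypothetical names for the two halves of %a.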
define <16 x i16> @test5(<16 x i16> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.
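;
; A hedged worked mapping for @test6 below (not part of the checks): each
; 128-bit half is shifted by <1,1,2,3>, which corresponds to the multipliers
; <2,2,4,8> matched by the pmulld checks, i.e. per half
;   %half = mul <4 x i32> %a.half, <i32 2, i32 2, i32 4, i32 8>
; (%half and %a.half are hypothetical names).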
define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,8,8]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE41-LABEL: test6:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE2 instead, we split the shift into four
; parts and convert each part into a pmullw.
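;
; A hedged sketch for @test7 below (illustrative only): the 512-bit shift by
; the repeated amounts <1,1,2,3,7,0,9,11> becomes a multiply by the repeated
; constant <2,2,4,8,128,1,512,2048>, lowered as four 128-bit pmullw on SSE and
; as two 256-bit vpmullw on AVX2/AVX512F, each path reusing a single
; in-register copy of that constant.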
define <32 x i16> @test7(<32 x i16> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: pmullw %xmm4, %xmm1
; SSE-NEXT: pmullw %xmm4, %xmm2
; SSE-NEXT: pmullw %xmm4, %xmm3
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512-LABEL: test7:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
; Similar to test7; the difference is that with AVX512 support
; we produce a single vpsllvd/vpsllvq instead of a pair.
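;
; A hedged note on @test8 below (not asserted by the checks): the variable-shift
; lowering keeps the raw amounts (the AVX2 vpsllvd operand is [1,1,2,3,...]),
; whereas the multiply lowering uses 1 << c per lane (<2,2,4,8> for pmulld);
; with AVX512F a single 512-bit vpsllvd handles the whole vector.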
define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2,2,8,8]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE41-LABEL: test8:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm4, %xmm1
; SSE41-NEXT: pmulld %xmm4, %xmm2
; SSE41-NEXT: pmulld %xmm4, %xmm3
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
; AVX512-LABEL: test8:
; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
; The shift in 'test9' is performed as separate shifts and blends if we don't have AVX2/AVX512f support.
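;
; A hedged reading of the checks for @test9 below (not asserted by the test):
; lanes shifted by 1 are lowered as an addition, since x << 1 == x + x (the
; paddq instructions), while the <2,3> lanes in each 128-bit half use two
; immediate psllq shifts whose results are blended together.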
define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm3
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm2
; SSE41-LABEL: test9:
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: paddq %xmm0, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm2
; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
; AVX512-LABEL: test9:
; AVX512-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>