; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512
; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
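;
; Illustrative sketch (not part of the checked output): a left shift by a
; constant amount k is the same as a multiply by 2^k, so the per-lane amounts
; <1,1,2,3,7,0,9,11> become the pmullw constant <2,2,4,8,128,1,512,2048>:
;   %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
;   ; is equivalent to
;   %mul = mul <8 x i16> %a, <i16 2, i16 2, i16 4, i16 8, i16 128, i16 1, i16 512, i16 2048>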
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,4,8,128,1,512,2048]
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,2,4,8,128,1,512,2048]
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

; Only two legal shift amounts, so we can lower to shuffle(psllw(),psllw())
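;
; Illustrative sketch (not part of the checked output): the shift-by-1 lanes
; can be computed as an add of %a to itself, and the result blended with the
; unshifted %a for the shift-by-0 lanes; the undef/poison lanes may come from
; either side. Roughly:
;   %dbl = add <8 x i16> %a, %a
;   %res = shufflevector <8 x i16> %a, <8 x i16> %dbl, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>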
define <8 x i16> @test2(<8 x i16> %a) {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
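;
; Note (illustrative, not part of the checked output): in 'test3' the lanes
; shifted by -1 and -3 are poison (the amount is not smaller than the element
; width), so only the amounts 1 and 2 have to be honoured; that is why a
; pslld $2 plus a paddd and a blend, as checked below, is also acceptable here.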
define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pslld $2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $2, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
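;
; Illustrative sketch (not part of the checked output): the multiply constant
; is materialised once and applied to each 128-bit half, roughly:
;   pmullw %xmm2, %xmm0    ; lanes 0-7
;   pmullw %xmm2, %xmm1    ; lanes 8-15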
define <16 x i16> @test5(<16 x i16> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.
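;
; Note (illustrative, not part of the checked output): SSE2 has no 32-bit
; element multiply (pmulld is SSE4.1), so the SSE2 lowering below emulates it
; with pmuludq on the even and odd lanes plus pshufd/punpckldq to reassemble
; the result.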
define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,8,8]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE41-LABEL: test6:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2, we instead split the shift into four
; parts and convert each part into a pmullw.
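;
; Note (illustrative, not part of the checked output): with plain AVX512F the
; 512-bit vpmullw is not available (that requires AVX512BW), so the <32 x i16>
; multiply is done as two 256-bit vpmullw plus an extract/insert of the upper
; half, as the AVX512 checks below show.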
define <32 x i16> @test7(<32 x i16> %a) {
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: pmullw %xmm4, %xmm1
; SSE-NEXT: pmullw %xmm4, %xmm2
; SSE-NEXT: pmullw %xmm4, %xmm3
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512-LABEL: test7:
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
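;
; Note (illustrative, not part of the checked output): vpsllvd on zmm registers
; is already provided by AVX512F, so unlike the i16 case in test7 the
; <16 x i32> shift does not have to be split; a single 512-bit vpsllvd with a
; constant-pool shift-count vector suffices.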
define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2,2,8,8]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE41-LABEL: test8:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT: pmulld %xmm4, %xmm0
; SSE41-NEXT: pmulld %xmm4, %xmm1
; SSE41-NEXT: pmulld %xmm4, %xmm2
; SSE41-NEXT: pmulld %xmm4, %xmm3
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
; AVX512-LABEL: test8:
; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; The shift amounts in 'test9' get applied separately and the results blended if we don't have AVX2/AVX512f support.
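;
; Illustrative sketch (not part of the checked output): without AVX2 there is
; no variable per-lane 64-bit shift (vpsllvq), so each distinct constant amount
; gets its own immediate psllq and the per-lane results are blended, while the
; shift-by-1 lanes are folded into a cheaper paddq (%x + %x == %x << 1):
;   psllq $2, %xmm4        ; lanes shifted by 2
;   psllq $3, %xmm1        ; lanes shifted by 3
;   ; blend the two results; paddq handles the lanes shifted by 1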
define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psllq $2, %xmm4
; SSE2-NEXT: psllq $3, %xmm3
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm2
; SSE41-LABEL: test9:
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq $3, %xmm4
; SSE41-NEXT: psllq $2, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: paddq %xmm0, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm2
; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
; AVX512-LABEL: test9:
; AVX512-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}