; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE

; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (shl x, c >= size(x)) -> undef
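; Illustrative note (not itself a checked test): for i32 elements any shift
; amount of 32 or more, like the 33 used below, produces poison, so the
; backend may emit nothing but a plain ret for these functions.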
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_outofrange3(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (shl x, 0) -> x
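; e.g. (shl %x, zeroinitializer) is just %x, so no shift instruction should
; survive lowering.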
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (shl x, c) is known to be zero, return 0
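; e.g. keeping only the top 16 bits and then shifting left by 16 discards
; every remaining bit: (shl (and x, 0xFFFF0000), 16) == 0, so a zeroed
; register suffices.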
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_known_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 8589803520, i32 17179607040, i32 34359214080>
  %2 = shl <4 x i32> %1, <i32 16, i32 15, i32 14, i32 13>
  ret <4 x i32> %2
}

; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
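; Illustrative sketch for the first lane below (a reading of the combine, not
; itself a checked test): the mask 15 fits in 32 bits, so
;   (shl %x, (trunc (and %y, 15))) == (shl %x, (and (trunc %y), 15))
; and the 64-bit 'and' can be performed after the truncate instead.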
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_shl_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST-ALL:       # %bb.0:
; AVX-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; AVX-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-ALL-NEXT:    vzeroupper
; AVX-FAST-ALL-NEXT:    retq
;
; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST-PERLANE:       # %bb.0:
; AVX-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-PERLANE-NEXT:    vzeroupper
; AVX-FAST-PERLANE-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
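; e.g. (x << 2) << 4 == x << 6, hence the single shift by 6 checked below.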
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = shl <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,64,256,1024]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = shl <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (shl (shl x, c1), c2) -> 0 if c1 + c2 >= bitwidth
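; e.g. 16 + 20 = 36 >= 32 bits, so every bit is shifted out and the result
; folds to a zero vector.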
define <4 x i32> @combine_vec_shl_shl_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = shl <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
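; e.g. an i16 shift by 4 followed by an i32 shift by 16 of the extended value
; equals one i32 shift by 20 (4 + 16): any bits the narrow shift drops would
; have been shifted past bit 31 anyway.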
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $20, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $20, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $20, %xmm1
; SSE41-NEXT:    pslld $20, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 31, i32 31, i32 30, i32 30, i32 29, i32 29, i32 28, i32 28>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [131072,524288,2097152,8388608]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [33554432,134217728,536870912,2147483648]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <8 x i32> %3
}

; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
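; e.g. ((x >> 4) << 4) merely clears the low 4 bits, so the pattern reduces
; to (and x, 0xFFF0) followed by the zero-extension, as checked below.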
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i32> %3
}

; fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
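; e.g. an 'ashr exact' by 3 followed by a shl by 5 is a net shl by 2
; (5 - 3); 'exact' guarantees the right shift dropped no set bits.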
define <4 x i32> @combine_vec_shl_ge_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_exact0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ge_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $5, %xmm1
; SSE41-NEXT:    psrad $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sr[la] exact SEL(X,Y), C1), C2) -> (shl SEL(X,Y), (C2-C1)) if C1 <= C2
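; Both select arms are 'ashr exact' by 3, so the shl by 5 folds through the
; select to a net shl by 2, i.e. a multiply by 4 (the scaled lea below).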
define i32 @combine_shl_ge_sel_ashr_exact0(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: combine_shl_ge_sel_ashr_exact0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    testl %edx, %edx
; CHECK-NEXT:    cmovel %esi, %edi
; CHECK-NEXT:    leal (,%rdi,4), %eax
; CHECK-NEXT:    retq
  %cmp = icmp ne i32 %z, 0
  %ashrx = ashr exact i32 %x, 3
  %ashry = ashr exact i32 %y, 3
  %sel = select i1 %cmp, i32 %ashrx, i32 %ashry
  %shl = shl i32 %sel, 5
  ret i32 %shl
}

; fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 > C2
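; e.g. an 'ashr exact' by 5 followed by a shl by 3 is a net ashr by 2 (5 - 3).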
define <4 x i32> @combine_vec_shl_lt_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_exact0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_lt_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    psrad $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) if C2 > C1
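; e.g. ((x >> 3) << 5) == ((x << 2) & 0xFFFFFFE0): shift left by the
; difference, then mask off the low 5 bits (4294967264 == 0xFFFFFFE0).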
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_gt_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_gt_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $5, %xmm1
; SSE41-NEXT:    psrld $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2)), MASK) if C1 >= C2
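; e.g. ((x >> 5) << 3) == ((x >> 2) & 0x3FFFFFF8): shift right by the
; difference, then mask (1073741816 == 0x3FFFFFF8 clears the low 3 bits and
; the top 2 bits that were shifted out).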
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $7, %xmm1
; SSE41-NEXT:    psrld $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
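; e.g. ((x ashr 5) << 5) just clears the low 5 bits: the sign copies created
; by the arithmetic shift are shifted back out, leaving (and x, 0xFFFFFFE0).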
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
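; e.g. ((x + 5) << 2) == ((x << 2) + 20): the constant moves through the
; shift as 5 << 2 = 20, matching the broadcasted 20 below.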
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
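; e.g. ((x | 5) << 2) == ((x << 2) | 20), since shl distributes over 'or'.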
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
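; e.g. ((x * 5) << 2) == x * 20, folding the shift into the multiply constant.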
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [20,20,20,20]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [10,24,56,128]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (add (shl x, c1), c2) -> (or (shl x, c1), c2)
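; Valid here because every lane is shifted left by at least 2, so the low two
; bits are known zero and adding 3 cannot produce a carry; the add is
; therefore an or.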
define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
  %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shuffle_shl:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    pslld $2, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,3,0]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shuffle_shl:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $3, %xmm1
; SSE41-NEXT:    pslld $2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shuffle_shl:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %3
}