; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

; fold (srl 0, x) -> 0
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (srl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_lshr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange0:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange1:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_outofrange2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_outofrange2:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (srl x, 0) -> x
define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_lshr_by_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = lshr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (srl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_known_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_known_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
  %2 = lshr <4 x i32> %1, <i32 8, i32 9, i32 10, i32 11>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> (srl x, (add c1, c2))
define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr0:
; SSE: # %bb.0:
; SSE-NEXT: psrld $6, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr0:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $6, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_lshr_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $10, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $8, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $6, %xmm1
; SSE2-NEXT: psrld $4, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_lshr1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $10, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $6, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $8, %xmm1
; SSE41-NEXT: psrld $4, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr1:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = lshr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (srl (srl x, c1), c2) -> 0
define <4 x i32> @combine_vec_lshr_lshr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = lshr <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = lshr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_lshr_trunc_lshr0:
; SSE2: # %bb.0:
; SSE2-NEXT: psrlq $48, %xmm1
; SSE2-NEXT: psrlq $48, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_trunc_lshr0:
; SSE41: # %bb.0:
; SSE41-NEXT: psrlq $48, %xmm1
; SSE41-NEXT: psrlq $48, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: combine_vec_lshr_trunc_lshr0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_lshr_trunc_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlq $34, %xmm2
; SSE2-NEXT: psrlq $35, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: psrlq $33, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: psrld $19, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: psrld $18, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE2-NEXT: psrld $17, %xmm0
; SSE2-NEXT: psrld $16, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_trunc_lshr1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlq $35, %xmm2
; SSE41-NEXT: psrlq $34, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq $33, %xmm2
; SSE41-NEXT: psrlq $32, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: psrld $19, %xmm1
; SSE41-NEXT: movaps %xmm2, %xmm3
; SSE41-NEXT: psrld $17, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: psrld $18, %xmm2
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; SSE41-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %ymm0, %xmm0
; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
  ret <4 x i32> %3
}

; fold (srl (trunc (srl x, c1)), c2) -> 0
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero0:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 48, i64 48, i64 48>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 24, i32 24, i32 24>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = lshr <4 x i64> %x, <i64 48, i64 49, i64 50, i64 51>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %2, <i32 24, i32 25, i32 26, i32 27>
  ret <4 x i32> %3
}

; fold (srl (shl x, c), c) -> (and x, cst2)
define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask0:
; SSE: # %bb.0:
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_lshr_shl_mask0:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_lshr_shl_mask0:
; AVX512: # %bb.0:
; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask1:
; SSE: # %bb.0:
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask1:
; AVX: # %bb.0:
; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <4 x i32> %x, <i32 2, i32 3, i32 4, i32 5>
  %2 = lshr <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  ret <4 x i32> %2
}

; fold (srl (sra X, Y), 31) -> (srl X, 31)
define <4 x i32> @combine_vec_lshr_ashr_sign(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_lshr_ashr_sign:
; SSE: # %bb.0:
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_ashr_sign:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %x, %y
  %2 = lshr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  ret <4 x i32> %2
}

; fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit0:
; SSE: # %bb.0:
; SSE-NEXT: psrld $4, %xmm0
; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_vec_lshr_lzcnt_bit0:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $4, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_lshr_lzcnt_bit0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $4, %xmm0, %xmm0
; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
  %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: psadbw %xmm1, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: psrld $5, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: pcmpeqb %xmm3, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: paddd %xmm3, %xmm0
; SSE41-NEXT: psrld $5, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm4
; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vplzcntd %xmm0, %xmm0
; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128>
  %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
  %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %3
}
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_lshr_trunc_and:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld %xmm4, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_trunc_and:
; SSE41: # %bb.0:
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm1, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: combine_vec_lshr_trunc_and:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqd %ymm1, %xmm1
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = lshr <4 x i32> %x, %2
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_lshr_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_clamped1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrld %xmm3, %xmm5
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm1, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,3]
; SSE2-NEXT: pandn %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_clamped1:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm1, %xmm4
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_clamped1:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shr = lshr <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> %shr, <4 x i32> zeroinitializer
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_lshr_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_clamped2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrld %xmm0, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrld %xmm0, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm0, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_clamped2:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm3, %xmm4
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm3, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_clamped2:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
  %shr = lshr <4 x i32> %1, %amt
  ret <4 x i32> %shr
}

define <4 x i32> @combine_vec_lshr_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_commuted_clamped:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrld %xmm0, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrld %xmm0, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm0, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_commuted_clamped:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm3, %xmm4
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm3, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_commuted_clamped:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
  %shr = lshr <4 x i32> %1, %amt
  ret <4 x i32> %shr
}

define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
; SSE2-LABEL: combine_vec_lshr_commuted_clamped1:
; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm2, %xmm4
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrld %xmm3, %xmm5
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_lshr_commuted_clamped1:
; SSE41: # %bb.0:
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: psrld %xmm2, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrld %xmm3, %xmm4
; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: psrld %xmm2, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_commuted_clamped1:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
  %shr = lshr <4 x i32> %sh, %amt
  %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr
  ret <4 x i32> %1
}