; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86,X86-FAST-ALL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X86,X86-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST-PERLANE
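
; The RUN lines cover a 32-bit (X86) and a 64-bit (X64) triple, each compiled
; three ways: baseline AVX2 (SLOW), with both cross-lane and per-lane variable
; shuffles marked fast (FAST-ALL), and with only per-lane variable shuffles
; marked fast (FAST-PERLANE). Most tests match on the shared X86/X64 prefixes;
; only srl_trunc_and_v4i64 below diverges per shuffle-tuning flavour. If the IR
; changes, the CHECK lines are intended to be regenerated with
; utils/update_llc_test_checks.py rather than edited by hand.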
; AVX2 Logical Shift Left
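;
; The tests below check that uniform immediate shift amounts fold to the
; cheapest form: a shift left by 0 is expected to disappear entirely, a shift
; left by 1 is rewritten as an add of the value to itself
; (vpaddw/vpaddd/vpaddq), and any other constant selects the immediate forms
; vpsllw/vpslld/vpsllq.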
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X86-LABEL: test_sllw_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_sllw_1:
; X64: # %bb.0: # %entry
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X86-LABEL: test_sllw_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; X64-LABEL: test_sllw_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpaddw %ymm0, %ymm0, %ymm0
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X86-LABEL: test_sllw_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsllw $15, %ymm0, %ymm0
; X64-LABEL: test_sllw_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsllw $15, %ymm0, %ymm0
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X86-LABEL: test_slld_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_slld_1:
; X64: # %bb.0: # %entry
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X86-LABEL: test_slld_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X64-LABEL: test_slld_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpaddd %ymm0, %ymm0, %ymm0
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
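
; test_vpslld_var shifts every element by the same run-time amount. The scalar
; count is moved into the low element of an XMM register (vmovd) and fed to the
; register form of vpslld, which shifts all lanes by the count held in the low
; quadword of that register, so the insertelement into lane 0 maps directly
; onto the instruction.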
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X86-LABEL: test_vpslld_var:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X86-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X64-LABEL: test_vpslld_var:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X86-LABEL: test_slld_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpslld $31, %ymm0, %ymm0
; X64-LABEL: test_slld_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpslld $31, %ymm0, %ymm0
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X86-LABEL: test_sllq_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_sllq_1:
; X64: # %bb.0: # %entry
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X86-LABEL: test_sllq_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X64-LABEL: test_sllq_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpaddq %ymm0, %ymm0, %ymm0
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X86-LABEL: test_sllq_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsllq $63, %ymm0, %ymm0
; X64-LABEL: test_sllq_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsllq $63, %ymm0, %ymm0
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
; AVX2 Arithmetic Shift
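;
; The same immediate folds apply to arithmetic shifts: ashr by 0 is expected to
; fold away, while the other uniform constants select vpsraw/vpsrad. There is
; no 64-bit case here because AVX2 has no packed 64-bit arithmetic shift
; (vpsraq only arrives with AVX-512).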
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X86-LABEL: test_sraw_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_sraw_1:
; X64: # %bb.0: # %entry
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X86-LABEL: test_sraw_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsraw $1, %ymm0, %ymm0
; X64-LABEL: test_sraw_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsraw $1, %ymm0, %ymm0
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X86-LABEL: test_sraw_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsraw $15, %ymm0, %ymm0
; X64-LABEL: test_sraw_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsraw $15, %ymm0, %ymm0
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X86-LABEL: test_srad_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_srad_1:
; X64: # %bb.0: # %entry
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X86-LABEL: test_srad_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrad $1, %ymm0, %ymm0
; X64-LABEL: test_srad_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrad $1, %ymm0, %ymm0
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X86-LABEL: test_srad_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrad $31, %ymm0, %ymm0
; X64-LABEL: test_srad_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrad $31, %ymm0, %ymm0
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
; AVX2 Logical Shift Right
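;
; Logical right shifts follow the same pattern: lshr by 0 is expected to fold
; away, and the other uniform constants select the immediate forms
; vpsrlw/vpsrld/vpsrlq.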
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X86-LABEL: test_srlw_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_srlw_1:
; X64: # %bb.0: # %entry
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X86-LABEL: test_srlw_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrlw $1, %ymm0, %ymm0
; X64-LABEL: test_srlw_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlw $1, %ymm0, %ymm0
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X86-LABEL: test_srlw_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrlw $15, %ymm0, %ymm0
; X64-LABEL: test_srlw_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlw $15, %ymm0, %ymm0
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X86-LABEL: test_srld_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_srld_1:
; X64: # %bb.0: # %entry
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X86-LABEL: test_srld_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrld $1, %ymm0, %ymm0
; X64-LABEL: test_srld_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrld $1, %ymm0, %ymm0
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X86-LABEL: test_srld_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrld $31, %ymm0, %ymm0
; X64-LABEL: test_srld_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrld $31, %ymm0, %ymm0
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X86-LABEL: test_srlq_1:
; X86: # %bb.0: # %entry
; X64-LABEL: test_srlq_1:
; X64: # %bb.0: # %entry
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X86-LABEL: test_srlq_2:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrlq $1, %ymm0, %ymm0
; X64-LABEL: test_srlq_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlq $1, %ymm0, %ymm0
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X86-LABEL: test_srlq_3:
; X86: # %bb.0: # %entry
; X86-NEXT: vpsrlq $63, %ymm0, %ymm0
; X64-LABEL: test_srlq_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlq $63, %ymm0, %ymm0
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
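
; srl_trunc_and_v4i64 masks a <4 x i64> amount, truncates it to <4 x i32> and
; uses the result as a variable shift count. The backend narrows the whole
; computation to 128 bits so it can finish with vpsrlvd; the interesting part
; is the truncating shuffle, where the FAST-ALL configurations use a single
; cross-lane vpermd while the SLOW and FAST-PERLANE configurations stay within
; lanes using vextractf128 + vshufps.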
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X86-SLOW-LABEL: srl_trunc_and_v4i64:
; X86-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X86-SLOW-NEXT: vzeroupper
; X86-SLOW-NEXT: retl
; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X86-FAST-ALL: # %bb.0:
; X86-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-ALL-NEXT: vzeroupper
; X86-FAST-ALL-NEXT: retl
; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X86-FAST-PERLANE: # %bb.0:
; X86-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-PERLANE-NEXT: vzeroupper
; X86-FAST-PERLANE-NEXT: retl
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X64-FAST-ALL: # %bb.0:
; X64-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
; X64-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-ALL-NEXT: vzeroupper
; X64-FAST-ALL-NEXT: retq
; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X64-FAST-PERLANE: # %bb.0:
; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-PERLANE-NEXT: vzeroupper
; X64-FAST-PERLANE-NEXT: retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
; Vectorized variable shifts (word and byte elements)
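;
; AVX2 only provides variable per-element shifts for 32-bit and 64-bit lanes
; (vpsllvd/vpsllvq, vpsrlvd/vpsrlvq, vpsravd), so the word and byte cases below
; have to be emulated. <8 x i16> is widened to <8 x i32>, shifted with the
; dword variable shift and packed back down to 16 bits. <16 x i16> is split
; into low and high word halves by interleaving with zero
; (vpunpcklwd/vpunpckhwd) so each half can reuse the dword shift, then
; recombined with vpackusdw. <32 x i8> uses the vpsllw $5 + vpblendvb ladder
; described before shl_32i8 below.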
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X86-LABEL: shl_8i16:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X86-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; X86-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X86-NEXT: vzeroupper
; X64-LABEL: shl_8i16:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: vzeroupper
  %shl = shl <8 x i16> %r, %a
define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X86-LABEL: shl_16i16:
; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; X86-NEXT: vpsrld $16, %ymm3, %ymm3
; X86-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X86-NEXT: vpsrld $16, %ymm0, %ymm0
; X86-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-LABEL: shl_16i16:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
  %shl = shl <16 x i16> %r, %a
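
; For 8-bit elements the lowering uses a blend ladder instead of widening:
; vpsllw $5 moves the three meaningful bits of each byte's shift amount up to
; the sign-bit position, and vpblendvb (which selects on each byte's sign bit)
; conditionally applies shifts of 4, 2 and then 1 (the last step as an add of
; the value to itself), doubling the amount register with vpaddb between steps
; to expose the next bit. The vpand against constant-pool masks clears the bits
; that vpsllw, a word-granularity shift, would otherwise leak across byte
; boundaries.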
define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: shl_32i8:
; X86-NEXT: vpsllw $5, %ymm1, %ymm1
; X86-NEXT: vpsllw $4, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsllw $2, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-LABEL: shl_32i8:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsllw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsllw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
  %shl = shl <32 x i8> %r, %a
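
; The arithmetic word shifts widen with sign extension (vpmovsxwd) rather than
; zero extension and narrow back with the signed-saturating vpackssdw, so the
; sign bits produced by vpsravd survive the repack.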
define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X86-LABEL: ashr_8i16:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT: vpmovsxwd %xmm0, %ymm0
; X86-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X86-NEXT: vzeroupper
; X64-LABEL: ashr_8i16:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
  %ashr = ashr <8 x i16> %r, %a
define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X86-LABEL: ashr_16i16:
; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X86-NEXT: vpsrld $16, %ymm3, %ymm3
; X86-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X86-NEXT: vpsrld $16, %ymm0, %ymm0
; X86-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-LABEL: ashr_16i16:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
  %ashr = ashr <16 x i16> %r, %a
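
; The byte arithmetic shift takes a different route: each byte is duplicated
; into both halves of a word with vpunpckhbw/vpunpcklbw, the vpsllw $5 +
; vpblendvb ladder drives vpsraw steps of 4, 2 and 1 on the copy sitting in the
; high byte, then vpsrlw $8 discards the low byte and vpackuswb reassembles the
; <32 x i8> result.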
define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: ashr_32i8:
; X86-NEXT: vpsllw $5, %ymm1, %ymm1
; X86-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-NEXT: vpsraw $4, %ymm3, %ymm4
; X86-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X86-NEXT: vpsraw $2, %ymm3, %ymm4
; X86-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X86-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X86-NEXT: vpsraw $1, %ymm3, %ymm4
; X86-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X86-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X86-NEXT: vpsrlw $8, %ymm2, %ymm2
; X86-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-NEXT: vpsraw $4, %ymm0, %ymm3
; X86-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-NEXT: vpsraw $2, %ymm0, %ymm3
; X86-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-NEXT: vpsraw $1, %ymm0, %ymm3
; X86-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X86-NEXT: vpsrlw $8, %ymm0, %ymm0
; X86-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X64-LABEL: ashr_32i8:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpsraw $4, %ymm3, %ymm4
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT: vpsraw $2, %ymm3, %ymm4
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT: vpsraw $1, %ymm3, %ymm4
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT: vpsrlw $8, %ymm2, %ymm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpsraw $4, %ymm0, %ymm3
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsraw $2, %ymm0, %ymm3
; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsraw $1, %ymm0, %ymm3
; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
; X64-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
  %ashr = ashr <32 x i8> %r, %a
define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X86-LABEL: lshr_8i16:
; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; X86-NEXT: vzeroupper
; X64-LABEL: lshr_8i16:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
  %lshr = lshr <8 x i16> %r, %a
define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X86-LABEL: lshr_16i16:
; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; X86-NEXT: vpsrld $16, %ymm3, %ymm3
; X86-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X86-NEXT: vpsrld $16, %ymm0, %ymm0
; X86-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-LABEL: lshr_16i16:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
  %lshr = lshr <16 x i16> %r, %a
define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: lshr_32i8:
; X86-NEXT: vpsllw $5, %ymm1, %ymm1
; X86-NEXT: vpsrlw $4, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsrlw $2, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsrlw $1, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-LABEL: lshr_32i8:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsrlw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $1, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
  %lshr = lshr <32 x i8> %r, %a