; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64

; AVX2 Logical Shift Left
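;
; All shift amounts in this group are constant splats. As the checks below
; show, a shift by 0 is folded away entirely, a left shift by 1 is lowered
; to an add of the value with itself (vpaddw/vpaddd/vpaddq), and any other
; uniform amount uses the immediate form of the shift (vpsllw/vpslld/vpsllq).
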
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpsllw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpsllw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X32-LABEL: test_slld_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_slld_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X32-LABEL: test_slld_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_slld_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_vpslld_var(i32 %shift) {
; X32-LABEL: test_vpslld_var:
; X32: # BB#0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X32-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpslld_var:
; X64: # BB#0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X32-LABEL: test_slld_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpslld $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_slld_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpslld $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpsllq $63, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpsllq $63, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift
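;
; Same idea for arithmetic right shifts: an ashr by a splat of 0 folds away,
; and non-zero uniform amounts lower to the immediate forms vpsraw/vpsrad.
; There is no 64-bit case here, presumably because AVX2 has no vpsraq.
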
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpsraw $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpsraw $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpsraw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpsraw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X32-LABEL: test_srad_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srad_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X32-LABEL: test_srad_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrad $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srad_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrad $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X32-LABEL: test_srad_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrad $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srad_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrad $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right
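;
; Mirrors the shift-left group: an lshr by a splat of 0 folds away and other
; uniform amounts use the immediate forms vpsrlw/vpsrld/vpsrlq. The
; srl_trunc_and_v4i64 test additionally expects the and+trunc of the shift
; amount to be narrowed so a single 128-bit vpsrlvd suffices.
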
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrlw $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrlw $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrlw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrlw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X32-LABEL: test_srld_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srld_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X32-LABEL: test_srld_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrld $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srld_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrld $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X32-LABEL: test_srld_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrld $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srld_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrld $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_1:
; X32: # BB#0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_1:
; X64: # BB#0: # %entry
; X64-NEXT: retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_2:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrlq $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_2:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrlq $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_3:
; X32: # BB#0: # %entry
; X32-NEXT: vpsrlq $63, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_3:
; X64: # BB#0: # %entry
; X64-NEXT: vpsrlq $63, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-LABEL: srl_trunc_and_v4i64:
; X32: # BB#0:
; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-NEXT: vpand %xmm2, %xmm1, %xmm1
; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: srl_trunc_and_v4i64:
; X64: # BB#0:
; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

; Vectorized variable shifts
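;
; These tests use per-element (non-uniform) shift amounts. As the checks
; below expect, i16 elements are zero/sign-extended to i32 and shifted with
; vpsllvd/vpsravd/vpsrlvd before being packed back, while i8 elements go
; through the vpsllw $5 + vpblendvb bit-by-bit sequence, since AVX2 has no
; native variable shifts for 8-bit or 16-bit elements.
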
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: shl_8i16:
; X32: # BB#0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<kill>
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: shl_8i16:
; X64: # BB#0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<kill>
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: shl_16i16:
; X32: # BB#0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: shl_16i16:
; X64: # BB#0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT: retq
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: shl_32i8:
; X32: # BB#0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpsllw $4, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpsllw $2, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: shl_32i8:
; X64: # BB#0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsllw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsllw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: ashr_8i16:
; X32: # BB#0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovsxwd %xmm0, %ymm0
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: ashr_8i16:
; X64: # BB#0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: ashr_16i16:
; X32: # BB#0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: ashr_16i16:
; X64: # BB#0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT: retq
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32: # BB#0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpsraw $4, %ymm3, %ymm4
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT: vpsraw $2, %ymm3, %ymm4
; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT: vpsraw $1, %ymm3, %ymm4
; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT: vpsrlw $8, %ymm2, %ymm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpsraw $4, %ymm0, %ymm3
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT: vpsraw $2, %ymm0, %ymm3
; X32-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT: vpsraw $1, %ymm0, %ymm3
; X32-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: ashr_32i8:
; X64: # BB#0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpsraw $4, %ymm3, %ymm4
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT: vpsraw $2, %ymm3, %ymm4
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT: vpsraw $1, %ymm3, %ymm4
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT: vpsrlw $8, %ymm2, %ymm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpsraw $4, %ymm0, %ymm3
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsraw $2, %ymm0, %ymm3
; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsraw $1, %ymm0, %ymm3
; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
; X64-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: lshr_8i16:
; X32: # BB#0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<kill>
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: lshr_8i16:
; X64: # BB#0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT: # kill: %xmm0<def> %xmm0<kill> %ymm0<kill>
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: lshr_16i16:
; X32: # BB#0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: lshr_16i16:
; X64: # BB#0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT: retq
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: lshr_32i8:
; X32: # BB#0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpsrlw $4, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpsrlw $2, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpsrlw $1, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: lshr_32i8:
; X64: # BB#0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsrlw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $1, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}