; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST

; AVX2 Logical Shift Left
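; Shifts by a splat constant are expected to select the immediate-form
; instructions (vpsllw/vpslld/vpsllq): a shift by 0 should fold away
; completely, and a left shift by 1 should lower to a vpadd of the vector
; with itself.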

define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsllw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsllw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X32-LABEL: test_slld_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_slld_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X32-LABEL: test_slld_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_slld_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}
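
; A uniform but non-constant shift amount should use the XMM-count form of
; vpslld (count taken from the low quadword of %xmm0) rather than a
; per-element vpsllvd.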

define <8 x i32> @test_vpslld_var(i32 %shift) {
; X32-LABEL: test_vpslld_var:
; X32: # %bb.0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X32-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpslld_var:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X32-LABEL: test_slld_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpslld $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_slld_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpslld $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsllq $63, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsllq $63, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift
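; Splat-constant 'ashr' amounts are expected to select the immediate forms
; (vpsraw/vpsrad); an all-zero shift amount should again fold to a plain return.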

define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsraw $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsraw $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsraw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsraw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X32-LABEL: test_srad_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srad_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X32-LABEL: test_srad_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrad $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srad_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrad $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X32-LABEL: test_srad_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrad $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srad_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrad $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right
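; Splat-constant 'lshr' amounts should likewise select the immediate forms
; (vpsrlw/vpsrld/vpsrlq), with a zero amount folding away.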

define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlw $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlw $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X32-LABEL: test_srld_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srld_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X32-LABEL: test_srld_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrld $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srld_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrld $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X32-LABEL: test_srld_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrld $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srld_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrld $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_1:
; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_1:
; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_2:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlq $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_2:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlq $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_3:
; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlq $63, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_3:
; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlq $63, %ymm0, %ymm0
; X64-NEXT: retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}
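
; The (and + trunc) of the 64-bit shift amounts should be narrowed so the
; shift itself is a 32-bit vpsrlvd; the SLOW runs truncate with a
; vextractf128/vshufps pair, while +fast-variable-shuffle prefers a single
; vpermd.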

define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-SLOW-LABEL: srl_trunc_and_v4i64:
; X32-SLOW: # %bb.0:
; X32-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X32-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X32-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-SLOW-NEXT: vzeroupper
; X32-SLOW-NEXT: retl
;
; X32-FAST-LABEL: srl_trunc_and_v4i64:
; X32-FAST: # %bb.0:
; X32-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X32-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X32-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
; X32-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-FAST-NEXT: vzeroupper
; X32-FAST-NEXT: retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
;
; X64-FAST-LABEL: srl_trunc_and_v4i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X64-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
; X64-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-NEXT: vzeroupper
; X64-FAST-NEXT: retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

; Vectorized variable shifts
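; AVX2 has no variable-count shifts for 16-bit or 8-bit elements, so these
; are expected to be emulated: i16 shifts widen to i32 and use
; vpsllvd/vpsravd/vpsrlvd before repacking, while i8 shifts use the
; vpsllw $5 + vpblendvb bit-serial blend sequence.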

define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: shl_8i16:
; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: shl_8i16:
; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: shl_16i16:
; X32: # %bb.0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: shl_16i16:
; X64: # %bb.0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT: retq
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: shl_32i8:
; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpsllw $4, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpsllw $2, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: shl_32i8:
; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsllw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsllw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: ashr_8i16:
; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovsxwd %xmm0, %ymm0
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: ashr_8i16:
; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: ashr_16i16:
; X32: # %bb.0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: ashr_16i16:
; X64: # %bb.0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT: retq
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}
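
; For the i8 arithmetic shift below, the bytes are unpacked into the high
; halves of words (preserving the sign bit), shifted with vpsraw under
; vpblendvb control, then repacked with vpsrlw $8 + vpackuswb.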

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT: vpsraw $4, %ymm3, %ymm4
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT: vpsraw $2, %ymm3, %ymm4
; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT: vpsraw $1, %ymm3, %ymm4
; X32-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT: vpsrlw $8, %ymm2, %ymm2
; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT: vpsraw $4, %ymm0, %ymm3
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT: vpsraw $2, %ymm0, %ymm3
; X32-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT: vpsraw $1, %ymm0, %ymm3
; X32-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: ashr_32i8:
; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT: vpsraw $4, %ymm3, %ymm4
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT: vpsraw $2, %ymm3, %ymm4
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT: vpsraw $1, %ymm3, %ymm4
; X64-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT: vpsrlw $8, %ymm2, %ymm2
; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT: vpsraw $4, %ymm0, %ymm3
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsraw $2, %ymm0, %ymm3
; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsraw $1, %ymm0, %ymm3
; X64-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
; X64-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: lshr_8i16:
; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: lshr_8i16:
; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: lshr_16i16:
; X32: # %bb.0:
; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; X32-NEXT: vpsrld $16, %ymm3, %ymm3
; X32-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpsrld $16, %ymm0, %ymm0
; X32-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: lshr_16i16:
; X64: # %bb.0:
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsrld $16, %ymm3, %ymm3
; X64-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpsrld $16, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT: retq
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: lshr_32i8:
; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpsrlw $4, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpsrlw $2, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: vpsrlw $1, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: lshr_32i8:
; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsrlw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $1, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}