; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-ALL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-PERLANE

; AVX2 Logical Shift Left
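;
; The constant-amount tests below cover the three lowerings visible in the
; checks: a shift by zero folds away entirely, a left shift by one is matched
; to a vpadd of the value with itself, and any other uniform amount selects
; the immediate forms vpsllw/vpslld/vpsllq.
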
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsllw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

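; A uniform but non-constant amount: as the checks show, the scalar count is
; moved into the low element of an xmm register and the whole vector is
; shifted with the register form of vpslld.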
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X86-LABEL: test_vpslld_var:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X86-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_vpslld_var:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpslld $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsllq $63, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift
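;
; Uniform ashr amounts fold away when zero and otherwise select the immediate
; forms vpsraw/vpsrad. There are no 64-bit cases here: AVX2 provides no
; vpsraq.
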
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %ashr = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %ashr
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %ashr = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %ashr
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %ashr = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %ashr
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %ashr = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %ashr
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrad $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %ashr = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %ashr
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %ashr = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %ashr
}

; AVX2 Logical Shift Right
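;
; As above: a zero amount folds away and any other uniform amount selects the
; immediate forms vpsrlw/vpsrld/vpsrlq.
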
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %lshr
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlw $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %lshr
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlw $15, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %lshr
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %lshr
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrld $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %lshr
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %lshr
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %lshr
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlq $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %lshr
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpsrlq $63, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %lshr = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %lshr
}

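; Variable per-element amounts arrive as v4i64, are masked, and truncated to
; v4i32 before the vpsrlvd. The v4i64 -> v4i32 truncation depends on shuffle
; tuning: the FAST-ALL runs use a single cross-lane vpermd, while the SLOW
; and FAST-PERLANE runs stay per-lane with vextractf128 + vshufps.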
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X86-SLOW-LABEL: srl_trunc_and_v4i64:
; X86-SLOW:       # %bb.0:
; X86-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-SLOW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-SLOW-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X86-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-SLOW-NEXT:    vzeroupper
; X86-SLOW-NEXT:    retl
;
; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X86-FAST-ALL:       # %bb.0:
; X86-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
; X86-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
; X86-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X86-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X86-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-ALL-NEXT:    vzeroupper
; X86-FAST-ALL-NEXT:    retl
;
; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X86-FAST-PERLANE:       # %bb.0:
; X86-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-FAST-PERLANE-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-PERLANE-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X86-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-PERLANE-NEXT:    vzeroupper
; X86-FAST-PERLANE-NEXT:    retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW:       # %bb.0:
; X64-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X64-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-SLOW-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT:    vzeroupper
; X64-SLOW-NEXT:    retq
;
; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X64-FAST-ALL:       # %bb.0:
; X64-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6]
; X64-FAST-ALL-NEXT:    # ymm2 = mem[0,1,0,1]
; X64-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-FAST-ALL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-ALL-NEXT:    vzeroupper
; X64-FAST-ALL-NEXT:    retq
;
; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X64-FAST-PERLANE:       # %bb.0:
; X64-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X64-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-FAST-PERLANE-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-PERLANE-NEXT:    vandps %xmm2, %xmm1, %xmm1
; X64-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-PERLANE-NEXT:    vzeroupper
; X64-FAST-PERLANE-NEXT:    retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %srl = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %srl
}

; Vectorized word and byte shifts
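;
; AVX2 only provides variable shift counts at 32-bit and 64-bit granularity,
; so the checks below show the emulations: 16-bit elements are widened to 32
; bits and shifted with vpsllvd/vpsravd/vpsrlvd, while 8-bit elements use a
; ladder of immediate shifts selected per byte with vpblendvb.
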
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: shl_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: shl_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: shl_32i8:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-NEXT:    vpsllw $4, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsllw $2, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: shl_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: ashr_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: ashr_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; CHECK-LABEL: ashr_32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $5, %ymm1, %ymm1
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT:    vpsraw $4, %ymm3, %ymm4
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpsraw $2, %ymm3, %ymm4
; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpsraw $1, %ymm3, %ymm4
; CHECK-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; CHECK-NEXT:    vpsrlw $8, %ymm2, %ymm2
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT:    vpsraw $4, %ymm0, %ymm3
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $2, %ymm0, %ymm3
; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $1, %ymm0, %ymm3
; CHECK-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: lshr_8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: lshr_16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT:    vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: lshr_32i8:
; X86:       # %bb.0:
; X86-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: lshr_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}