; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,X86-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-ALL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X86,X86-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,X64,X64-FAST-PERLANE

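; Each target (i686 and x86_64) is run three ways: with default tuning
; (*-SLOW), with both fast-variable-shuffle attributes (*-FAST-ALL), and with
; only the per-lane attribute (*-FAST-PERLANE). Output common to all runs is
; checked under the shared CHECK/X86/X64 prefixes.
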
; AVX2 Logical Shift Left

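; Left shifts by a uniform constant should use the immediate forms: a shift by
; 0 folds away entirely, a shift by 1 is rewritten as an add, and other
; amounts become vpsllw/vpslld/vpsllq with an immediate.
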
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sllw_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

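; The next test uses a scalar shift amount that is the same for every lane:
; it is expected to be moved into an XMM register with vmovd and used with
; vpslld's shift-by-scalar form rather than a per-element variable shift.
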
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X86-LABEL: test_vpslld_var:
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X86-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_vpslld_var:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_slld_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_sllq_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift

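; Arithmetic right shifts by a uniform constant fold away for a shift of 0 and
; otherwise select vpsraw/vpsrad with an immediate. AVX2 has no 64-bit
; arithmetic shift instruction (a vpsraq form only appears with AVX-512), so
; there is no i64 variant in this section.
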
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsraw $1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_sraw_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrad $1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srad_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrad $31, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right

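; Logical right shifts by a uniform constant follow the same pattern: a shift
; of 0 folds away and other amounts use vpsrlw/vpsrld/vpsrlq with an immediate.
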
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; CHECK-LABEL: test_srlw_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrlw $15, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrld $1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; CHECK-LABEL: test_srld_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrlq $1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; CHECK-LABEL: test_srlq_3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

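; In the next test the AND with 8 and the truncation of the 64-bit shift
; amounts should be folded so that only a 32-bit variable shift (vpsrlvd) is
; needed. The truncating shuffle differs by tuning: the FAST-ALL runs use a
; cross-lane vpermd, while SLOW and FAST-PERLANE extract the high half and
; combine with vshufps.
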
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X86-SLOW-LABEL: srl_trunc_and_v4i64:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X86-SLOW-NEXT: vzeroupper
; X86-SLOW-NEXT: retl
;
; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X86-FAST-ALL: # %bb.0:
; X86-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
; X86-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-ALL-NEXT: vzeroupper
; X86-FAST-ALL-NEXT: retl
;
; X86-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X86-FAST-PERLANE: # %bb.0:
; X86-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
; X86-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X86-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X86-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X86-FAST-PERLANE-NEXT: vzeroupper
; X86-FAST-PERLANE-NEXT: retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
;
; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64:
; X64-FAST-ALL: # %bb.0:
; X64-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
; X64-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-ALL-NEXT: vzeroupper
; X64-FAST-ALL-NEXT: retq
;
; X64-FAST-PERLANE-LABEL: srl_trunc_and_v4i64:
; X64-FAST-PERLANE: # %bb.0:
; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2
; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; X64-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-PERLANE-NEXT: vzeroupper
; X64-FAST-PERLANE-NEXT: retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

; Vectorized byte shifts

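; AVX2 only provides per-element variable shifts for 32-bit and 64-bit lanes
; (vpsllvd/q, vpsrlvd/q, vpsravd). Variable i16 shifts are therefore widened
; to i32 and shifted with the dword instructions, and variable i8 shifts are
; emulated with vpsllw $5 plus a cascade of vpblendvb selections, one per
; shift-amount bit.
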
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: shl_8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: shl_16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: shl_32i8:
; X86: # %bb.0:
; X86-NEXT: vpsllw $5, %ymm1, %ymm1
; X86-NEXT: vpsllw $4, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsllw $2, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: shl_32i8:
; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsllw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsllw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: ashr_8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: ashr_16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; CHECK-LABEL: ashr_32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-NEXT: vpsraw $4, %ymm3, %ymm4
; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT: vpsraw $2, %ymm3, %ymm4
; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4
; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2
; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3
; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT: vpsraw $2, %ymm0, %ymm3
; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3
; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; CHECK-LABEL: lshr_8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; CHECK-LABEL: lshr_16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X86-LABEL: lshr_32i8:
; X86: # %bb.0:
; X86-NEXT: vpsllw $5, %ymm1, %ymm1
; X86-NEXT: vpsrlw $4, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsrlw $2, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsrlw $1, %ymm0, %ymm2
; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X86-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: lshr_32i8:
; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsrlw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $2, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrlw $1, %ymm0, %ymm2
; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; X64-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}