; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,AVX2,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,AVX2,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX512,X64-AVX512

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
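; The pshufb performs a per-lane byte shift; the following shuffle pulls the
; shifted-in zeros back over the surviving bytes, so nothing of %a0 remains
; and both tests fold to an all-zeros vector.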
define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pslldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_psrldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  ret <32 x i8> %2
}

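; A byte shuffle of a vpermd/vpermps result merges with the dword permute into
; a single vpshufb mask.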
define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; CHECK-LABEL: combine_pshufb_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; CHECK-LABEL: combine_pshufb_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

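; Zeroing whole words on either side of a pshufb combines into a single
; vpblendw against a zero vector.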
define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_and_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_and:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %2
}

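; Cross-lane shuffle chains collapse into a single vperm2i128 or a vpermps
; with a merged index vector.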
define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vperm2i128:
; X86:       # %bb.0:
; X86-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X86-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
; X64:       # %bb.0:
; X64-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; CHECK-LABEL: combine_as_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3>
  ret <8 x i32> %3
}

define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
; CHECK-LABEL: combine_as_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
  ret <8 x float> %3
}

define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ret <32 x i8> %3
}

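; Shuffle masks that splat a single element are matched as the corresponding
; vpbroadcastb/w/d/q or vbroadcastss/sd of the low element.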
define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; X86-LABEL: combine_pshufb_as_vpbroadcastd128:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd %xmm0, %xmm0
; X86-NEXT:    vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
  ret <16 x i8> %2
}

define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; X86-LABEL: combine_permd_as_vpbroadcastd256:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd %xmm0, %ymm0
; X86-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastd256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; X86-LABEL: combine_permd_as_vpbroadcastq256:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastq %xmm0, %ymm0
; X86-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastq256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x float> %a to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
; CHECK-LABEL: combine_permps_as_vpbroadcastss256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
; CHECK-LABEL: combine_permps_as_vpbroadcastsd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
  ret <16 x i8> %2
}

define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
  ret <32 x i8> %2
}

define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

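; Dword permutes that keep adjacent element pairs together narrow to a single
; qword permute (vpermpd).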
define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; CHECK-LABEL: combine_permd_as_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
  ret <8 x i32> %1
}

define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; CHECK-LABEL: combine_permps_as_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
  ret <8 x float> %1
}

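; A pshufb that widens elements by filling the high bytes with zeros is a
; zero-extension (vpmovzxwq); the 128-bit variant below does not map to a
; single extend and keeps its shuffle sequence.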
define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_zext:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_zext128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

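; Keeping only the low element and zeroing the rest (a vzmovl pattern) lowers
; to a blend with a zero vector.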
define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x double>
  ret <4 x double> %3
}

define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <8 x float>
  ret <8 x float> %3
}

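; pshufb masks that move whole bytes/words/dwords/qwords uniformly are matched
; as immediate shifts or pshuflw/pshufhw; the final test crosses word
; boundaries and must stay a vpshufb.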
define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $24, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $40, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_not_as_pshufw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res1
}

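; Interleaving pshufb masks are matched as punpckl/punpckh, with undef or zero
; as the implicit second operand.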
define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpacklo_undef:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpacklo_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpackhi_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
  ret <32 x i8> %1
}

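; A vector shift feeding a pshufb that only repositions the shifted-in zeros
; can simplify to a constant AND (psrlw/pslld cases) or fold into the vpshufb
; mask (psrlq case).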
define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X86-LABEL: combine_psrlw_pshufb:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <16 x i16> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X86-LABEL: combine_pslld_pshufb:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; CHECK-LABEL: combine_psrlq_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
  ret <32 x i8> %3
}

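; A tree of unpacks reduces to a single vpshufb.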
define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_unpack_unpack_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %6
}

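; With inputs pre-shifted so that saturation is a no-op, the pack itself folds
; away and only a shift plus a vpshufb remains.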
define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
; CHECK-LABEL: shuffle_combine_packssdw_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: shuffle_combine_packsswb_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: shuffle_combine_packusdw_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: shuffle_combine_packuswb_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

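; Complementary truncating pshufb pairs OR'd together are recognized as a
; single packsswb/packuswb; the shifts guarantee the inputs are in range.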
define <32 x i8> @combine_pshufb_as_packsswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $11, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $11, %ymm1, %ymm1
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = ashr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <16 x i16> %1 to <32 x i8>
  %4 = bitcast <16 x i16> %2 to <32 x i8>
  %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %7 = or <32 x i8> %5, %6
  ret <32 x i8> %7
}

define <32 x i8> @combine_pshufb_as_packuswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $11, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlw $11, %ymm1, %ymm1
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = lshr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <16 x i16> %1 to <32 x i8>
  %4 = bitcast <16 x i16> %2 to <32 x i8>
  %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %7 = or <32 x i8> %5, %6
  ret <32 x i8> %7
}

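; Splatting element 0 of a scalar insertion becomes a direct broadcast of the
; inserted value.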
define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
; X86-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X86:       # %bb.0:
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %3
}

define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
; X86-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vbroadcastsd %xmm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <4 x i64> %1 to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  ret <8 x i32> %3
}

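; pshufb pairs with complementary zero masks combine with the OR into a blend,
; an unpack, or a single shuffle.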
define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: combine_pshufb_pshufb_or_as_blend:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %3 = or <32 x i8> %1, %2
  ret <32 x i8> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
  %2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
  %3 = or <32 x i8> %1, %2
  ret <32 x i8> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
  %3 = or <32 x i8> %1, %2
  %4 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <32 x i8> %4
}

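; Shuffle intrinsics with constant operands fold to constants.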
define <8 x i32> @constant_fold_permd() {
; CHECK-LABEL: constant_fold_permd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x i32> %1
}

define <8 x float> @constant_fold_permps() {
; CHECK-LABEL: constant_fold_permps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x float> %1
}

define <32 x i8> @constant_fold_pshufb_256() {
; CHECK-LABEL: constant_fold_pshufb_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <32 x i8> %1
}

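; Reduced regression tests from PR27320 and PR34577.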
define <32 x i8> @PR27320(<8 x i32> %a0) {
; CHECK-LABEL: PR27320:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
  ret <32 x i8> %3
}

define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
; CHECK-LABEL: PR34577:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; CHECK-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
  %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
  %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
  %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
  ret <8 x float> %shuf2
}

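; The compare of undefs folds to zero, so the packed result stores an
; all-zeros vector.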
define void @packss_zext_v8i1() {
; X86-LABEL: packss_zext_v8i1:
; X86:       # %bb.0:
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: packss_zext_v8i1:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovups %ymm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %tmp0 = icmp sgt <8 x i32> undef, undef
  %tmp1 = zext <8 x i1> %tmp0 to <8 x i32>
  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = trunc <16 x i32> %tmp2 to <16 x i16>
  %tmp4 = add <16 x i16> zeroinitializer, %tmp3
  %tmp6 = sext <16 x i16> %tmp4 to <16 x i32>
  %tmp10 = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %tmp11 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> %tmp10)
  store <16 x i16> %tmp11, <16 x i16>* undef, align 2
  ret void
}