; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX512

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)

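; The pshufb zeroes bytes 0-7 of each 128-bit lane and the following
; shufflevector keeps only zeroed bytes, so the next two tests should fold to
; an all-zeros vector (vxorps).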
define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pslldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_psrldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; CHECK-LABEL: combine_pshufb_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; CHECK-LABEL: combine_pshufb_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

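; An AND-style mask (shufflevector with zero) on either side of a pshufb that
; keeps only words 0 and 4 of each lane combines to a single vpblendw with a
; zero vector.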
define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_and_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_and:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %2
}

define <4 x i64> @combine_permq_pshufb_as_vextracti128(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vextracti128:
; X86:       # %bb.0:
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
; X86-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vextracti128:
; X64:       # %bb.0:
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm0
; X64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vmovdqa:
; X86:       # %bb.0:
; X86-NEXT:    vmovdqa %xmm0, %xmm0
; X86-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vmovdqa:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa %xmm0, %xmm0
; X64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; CHECK-LABEL: combine_as_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3>
  ret <8 x i32> %3
}

define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
; CHECK-LABEL: combine_as_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
  ret <8 x float> %3
}

define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vmovaps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <32 x i8> %3
}

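; Splat patterns built from pshufb/permd/permps should combine to the matching
; vpbroadcast* / vbroadcasts* instruction.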
define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; X86-LABEL: combine_pshufb_as_vpbroadcastd128:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd %xmm0, %xmm0
; X86-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
  ret <16 x i8> %2
}

define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; X86-LABEL: combine_permd_as_vpbroadcastd256:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd %xmm0, %ymm0
; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastd256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; X86-LABEL: combine_permd_as_vpbroadcastq256:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastq %xmm0, %ymm0
; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastq256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x float> %a to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
; CHECK-LABEL: combine_permps_as_vpbroadcastss256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
; CHECK-LABEL: combine_permps_as_vpbroadcastsd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
  ret <16 x i8> %2
}

define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
  ret <32 x i8> %2
}

define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

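; Permute masks that move whole 64-bit pairs can use the immediate form
; vpermpd instead of a variable permute.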
define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; CHECK-LABEL: combine_permd_as_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
  ret <8 x i32> %1
}

define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; CHECK-LABEL: combine_permps_as_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
  ret <8 x float> %1
}

define <8 x float> @combine_permps_as_vpermilps(<8 x float> %a, i32 %a1) {
; CHECK-LABEL: combine_permps_as_vpermilps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = insertelement <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>, i32 %a1, i32 0
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> %1)
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %3
}

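; A pshufb mask that zero-fills the upper bytes of each element matches a
; zero-extension (vpmovzxwq).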
define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_zext:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_zext128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x double>
  ret <4 x double> %3
}

define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <8 x float>
  ret <8 x float> %3
}

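; pshufb masks that only slide or zero bytes within each 128-bit lane should
; lower to the equivalent immediate byte/word/dword/qword shift instructions.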
define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $24, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $40, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res0
}

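; Negative test: the combined word shuffle touches both the low and high half
; of each lane, so it cannot be a single pshuflw/pshufhw; AVX512 can instead
; rotate each dword by 16 bits (vprold).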
define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; AVX2-LABEL: combine_pshufb_not_as_pshufw:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; AVX2-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: combine_pshufb_not_as_pshufw:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vprold $16, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res1
}

define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpacklo_undef:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpacklo_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpackhi_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X86-LABEL: combine_psrlw_pshufb:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <16 x i16> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X86-LABEL: combine_pslld_pshufb:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; CHECK-LABEL: combine_psrlq_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_unpack_unpack_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %6
}

define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
; CHECK-LABEL: shuffle_combine_packssdw_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: shuffle_combine_packsswb_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: shuffle_combine_packusdw_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: shuffle_combine_packuswb_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @combine_pshufb_as_packsswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $11, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $11, %ymm1, %ymm1
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = ashr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <16 x i16> %1 to <32 x i8>
  %4 = bitcast <16 x i16> %2 to <32 x i8>
  %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %7 = or <32 x i8> %5, %6
  ret <32 x i8> %7
}

define <32 x i8> @combine_pshufb_as_packuswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $11, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlw $11, %ymm1, %ymm1
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = lshr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <16 x i16> %1 to <32 x i8>
  %4 = bitcast <16 x i16> %2 to <32 x i8>
  %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %7 = or <32 x i8> %5, %6
  ret <32 x i8> %7
}

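; Broadcasting a freshly inserted scalar should become a direct broadcast:
; from memory on 32-bit targets, from a GPR move on 64-bit targets.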
define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
; X86-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X86:       # %bb.0:
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %3
}

define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
; X86-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X86:       # %bb.0:
; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <4 x i64> %1 to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  ret <8 x i32> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: combine_pshufb_pshufb_or_as_blend:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %3 = or <32 x i8> %1, %2
  ret <32 x i8> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
  %2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
  %3 = or <32 x i8> %1, %2
  ret <32 x i8> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
  %3 = or <32 x i8> %1, %2
  %4 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <32 x i8> %4
}

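; Variable permutes and pshufb of all-constant inputs should be constant
; folded to a constant-pool load.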
define <8 x i32> @constant_fold_permd() {
; CHECK-LABEL: constant_fold_permd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x i32> %1
}

define <8 x float> @constant_fold_permps() {
; CHECK-LABEL: constant_fold_permps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x float> %1
}

define <32 x i8> @constant_fold_pshufb_256() {
; CHECK-LABEL: constant_fold_pshufb_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <32 x i8> %1
}

define i32 @broadcast_v2i64_multiuse(i64* %p0) {
; X86-LABEL: broadcast_v2i64_multiuse:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %eax
; X86-NEXT:    addl %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v2i64_multiuse:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    addl %eax, %eax
; X64-NEXT:    retq
entry:
  %tmp = load i64, i64* %p0, align 8
  %tmp1 = trunc i64 %tmp to i32
  %tmp2 = insertelement <2 x i64> undef, i64 %tmp, i32 0
  %tmp3 = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <2 x i32> zeroinitializer
  %tmp4 = trunc <2 x i64> %tmp3 to <2 x i32>
  %tmp5 = extractelement <2 x i32> %tmp4, i32 1
  %tmp6 = add i32 %tmp1, %tmp5
  ret i32 %tmp6
}

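; PR27320: the cross-lane i32 shuffle and the byte-duplicating shuffle should
; combine to vpermq + vpshufb without mixing up the lane boundaries.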
define <32 x i8> @PR27320(<8 x i32> %a0) {
; CHECK-LABEL: PR27320:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,20,21,21,22,23,24,24,25,26,27,27,28,29,30,30,31]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
  ret <32 x i8> %3
}

define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
; AVX2-LABEL: PR34577:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: PR34577:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX512-NEXT:    vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2>
; AVX512-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    ret{{[l|q]}}
entry:
  %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
  %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
  %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
  %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
  ret <8 x float> %shuf2
}

define void @packss_zext_v8i1() {
; X86-LABEL: packss_zext_v8i1:
; X86:       # %bb.0:
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: packss_zext_v8i1:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovups %ymm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %tmp0 = icmp sgt <8 x i32> undef, undef
  %tmp1 = zext <8 x i1> %tmp0 to <8 x i32>
  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = trunc <16 x i32> %tmp2 to <16 x i16>
  %tmp4 = add <16 x i16> zeroinitializer, %tmp3
  %tmp6 = sext <16 x i16> %tmp4 to <16 x i32>
  %tmp10 = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %tmp11 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> %tmp10)
  store <16 x i16> %tmp11, <16 x i16>* undef, align 2
  ret void
}