1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,AVX2,X86-AVX2
3 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX512,X86-AVX512
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,AVX2,X64-AVX2
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX512,X64-AVX512
7 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
8 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
9 declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
10 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
; The pshufb moves bytes 0-7 of each 128-bit lane into bytes 8-15 and zeroes bytes 0-7
; (mask value 128 has bit 7 set). The following shufflevector zeroes the low 8 bytes of
; each lane again and copies the (already zero) low 8 bytes of %1 into the high 8 bytes,
; so the expected output is a plain register zeroing.
12 define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
13 ; CHECK-LABEL: combine_pshufb_pslldq:
15 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
16 ; CHECK-NEXT: ret{{[l|q]}}
17 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
18 %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; Mirror image of the pslldq case: the pshufb moves bytes 8-15 of each 128-bit lane into
; bytes 0-7 and zeroes bytes 8-15. The shufflevector then selects the zeroed high bytes of
; %1 for the low half of each lane and zero for the high half, so the expected output is a
; plain register zeroing.
22 define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
23 ; CHECK-LABEL: combine_pshufb_psrldq:
25 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
26 ; CHECK-NEXT: ret{{[l|q]}}
27 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
28 %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
; A permd that is the identity except that dword 7 is replaced by dword 4, followed by a
; byte shuffle that is the identity except that byte 31 repeats byte 30. The expected
; output merges both into a single vpshufb.
32 define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
33 ; CHECK-LABEL: combine_pshufb_vpermd:
35 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
36 ; CHECK-NEXT: ret{{[l|q]}}
37 %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
38 %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
39 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
; Floating-point variant of combine_pshufb_vpermd: a permps that replaces element 7 with
; element 4, a bitcast to bytes, and a byte shuffle whose last byte repeats byte 30. The
; expected output is the same single vpshufb as in the integer variant.
43 define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
44 ; CHECK-LABEL: combine_pshufb_vpermps:
46 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
47 ; CHECK-NEXT: ret{{[l|q]}}
48 %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
49 %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
50 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
; A shufflevector that zeroes bytes 2-3 of %a0 (an AND-like mask) feeds a pshufb that
; keeps bytes 0-1 and 8-9 of each 128-bit lane and zeroes everything else (mask value -1
; has bit 7 set). The expected output is a blend of the input words with a zero register.
54 define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
55 ; CHECK-LABEL: combine_and_pshufb:
57 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
58 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
59 ; CHECK-NEXT: ret{{[l|q]}}
60 %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
61 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
; Same two operations as combine_and_pshufb in the opposite order: the pshufb (keep bytes
; 0-1 and 8-9 of each lane, zero the rest) runs first and the shufflevector that zeroes
; bytes 2-3 runs second. The expected output is the same zero-blend.
65 define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
66 ; CHECK-LABEL: combine_pshufb_and:
68 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
69 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
70 ; CHECK-NEXT: ret{{[l|q]}}
71 %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
72 %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Reverses the four i64 elements, then a pshufb swaps the two qwords of the low lane and
; zeroes the whole upper lane. The net effect is "upper 128 bits of %a0 in the low half,
; zero above", which the checks expect as vextracti128 $1. The trailing add of a constant
; vector is expected as a vpaddq from the constant pool; the two triples differ only in
; how that constant is addressed.
76 define <4 x i64> @combine_permq_pshufb_as_vextracti128(<4 x i64> %a0) {
77 ; X86-LABEL: combine_permq_pshufb_as_vextracti128:
79 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm0
80 ; X86-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
83 ; X64-LABEL: combine_permq_pshufb_as_vextracti128:
85 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
86 ; X64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
88 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
89 %2 = bitcast <4 x i64> %1 to <32 x i8>
90 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
91 %4 = bitcast <32 x i8> %3 to <4 x i64>
92 %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
; Swaps the qwords inside each 128-bit half, then a pshufb swaps the low-lane qwords back
; and zeroes the whole upper lane. The net effect is "low 128 bits unchanged, upper 128
; bits zero", which the checks expect as an xmm-to-xmm vmovdqa. The trailing add of a
; constant vector is expected as a vpaddq from the constant pool.
96 define <4 x i64> @combine_permq_pshufb_as_vmovdqa(<4 x i64> %a0) {
97 ; X86-LABEL: combine_permq_pshufb_as_vmovdqa:
99 ; X86-NEXT: vmovdqa %xmm0, %xmm0
100 ; X86-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
103 ; X64-LABEL: combine_permq_pshufb_as_vmovdqa:
105 ; X64-NEXT: vmovdqa %xmm0, %xmm0
106 ; X64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
108 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
109 %2 = bitcast <4 x i64> %1 to <32 x i8>
110 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
111 %4 = bitcast <32 x i8> %3 to <4 x i64>
112 %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
; Two different permutations of %a0 (a half-swapping shufflevector and a permd) are
; interleaved by a final two-input shufflevector. Composing the masks yields source
; elements [4,5,4,5,6,7,0,7], which the checks expect as one variable permute with that
; index vector loaded as a constant.
116 define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
117 ; CHECK-LABEL: combine_as_vpermd:
119 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
120 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
121 ; CHECK-NEXT: ret{{[l|q]}}
122 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
123 %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
124 %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 15, i32 14, i32 4, i32 3>
; Floating-point counterpart of combine_as_vpermd. The permps index vector contains an
; undef element, and the composed mask keeps that element undefined: the expected constant
; is <6,4,7,5,1,u,4,7>, fed to a single vpermps.
128 define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
129 ; CHECK-LABEL: combine_as_vpermps:
131 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
132 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
133 ; CHECK-NEXT: ret{{[l|q]}}
134 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
135 %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
136 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
; Same shuffle pair as combine_permq_pshufb_as_vmovdqa but without the trailing integer
; add: qwords are swapped within each half, then the pshufb swaps the low lane back and
; zeroes the upper lane. The checks expect an xmm-to-xmm vmovaps.
140 define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) {
141 ; CHECK-LABEL: combine_permq_pshufb_as_vmovaps:
143 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
144 ; CHECK-NEXT: ret{{[l|q]}}
145 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
146 %2 = bitcast <4 x i64> %1 to <32 x i8>
147 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
; Qwords are swapped within each half, then the pshufb zeroes the whole low lane and swaps
; the upper-lane qwords back. The net effect is "low 128 bits zero, upper 128 bits
; unchanged", which the checks expect as a blend with a zeroed register.
151 define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
152 ; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
154 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
155 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
156 ; CHECK-NEXT: ret{{[l|q]}}
157 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
158 %2 = bitcast <4 x i64> %1 to <32 x i8>
159 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
; A 128-bit pshufb with an all-zero mask copies byte 0 into every byte; the checks expect
; it to be selected as vpbroadcastb.
163 define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
164 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
166 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
167 ; CHECK-NEXT: ret{{[l|q]}}
168 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
; Widens a 128-bit input to 256 bits (upper elements undef), splats byte 0 within each
; lane via a zero-mask pshufb, then splats dword 0 across the whole vector via a zero-index
; permd. The checks expect the chain to become one xmm-to-ymm vpbroadcastb.
172 define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
173 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
175 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
176 ; CHECK-NEXT: ret{{[l|q]}}
177 %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
178 %2 = bitcast <4 x i64> %1 to <32 x i8>
179 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
180 %4 = bitcast <32 x i8> %3 to <8 x i32>
181 %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
182 %6 = bitcast <8 x i32> %5 to <32 x i8>
; A 128-bit pshufb whose mask repeats the byte pair (0,1) copies word 0 into every word;
; the checks expect vpbroadcastw.
186 define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
187 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
189 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
190 ; CHECK-NEXT: ret{{[l|q]}}
191 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
; 256-bit word splat built from three steps: widen the 128-bit input (upper elements
; undef), splat word 0 within each lane with a pshufb repeating bytes (0,1), then splat
; dword 0 across the vector with a zero-index permd. Expected as one xmm-to-ymm
; vpbroadcastw.
195 define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
196 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
198 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
199 ; CHECK-NEXT: ret{{[l|q]}}
200 %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
201 %2 = bitcast <4 x i64> %1 to <32 x i8>
202 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
203 %4 = bitcast <32 x i8> %3 to <8 x i32>
204 %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
205 %6 = bitcast <8 x i32> %5 to <32 x i8>
; A 128-bit pshufb whose mask repeats bytes (0,1,2,3) splats dword 0; the checks expect
; vpbroadcastd. The result then feeds a byte add with a constant vector, expected as a
; vpaddb from the constant pool (addressed differently on the two triples).
209 define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
210 ; X86-LABEL: combine_pshufb_as_vpbroadcastd128:
212 ; X86-NEXT: vpbroadcastd %xmm0, %xmm0
213 ; X86-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
216 ; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
218 ; X64-NEXT: vpbroadcastd %xmm0, %xmm0
219 ; X64-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
221 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
222 %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
; Widens a 128-bit input to 256 bits (only element 0 defined) and splats dword 0 with a
; zero-index permd; the checks expect an xmm-to-ymm vpbroadcastd. The result then feeds an
; i32 add with a constant vector, expected as a vpaddd from the constant pool.
226 define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
227 ; X86-LABEL: combine_permd_as_vpbroadcastd256:
229 ; X86-NEXT: vpbroadcastd %xmm0, %ymm0
230 ; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
233 ; X64-LABEL: combine_permd_as_vpbroadcastd256:
235 ; X64-NEXT: vpbroadcastd %xmm0, %ymm0
236 ; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
238 %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
239 %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
240 %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; A 128-bit pshufb whose mask repeats bytes 0-7 duplicates the low qword into the high
; qword; the checks expect vmovddup.
244 define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
245 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
247 ; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
248 ; CHECK-NEXT: ret{{[l|q]}}
249 %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
; A permd whose index vector repeats (0,1) replicates the low 64 bits across the vector;
; the checks expect vpbroadcastq. The result then feeds an i32 add with a constant vector,
; expected as a vpaddd from the constant pool.
; NOTE(review): the widening shufflevector leaves element 1 undef even though the permd
; reads it - presumably intentional; confirm.
253 define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
254 ; X86-LABEL: combine_permd_as_vpbroadcastq256:
256 ; X86-NEXT: vpbroadcastq %xmm0, %ymm0
257 ; X86-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
260 ; X64-LABEL: combine_permd_as_vpbroadcastq256:
262 ; X64-NEXT: vpbroadcastq %xmm0, %ymm0
263 ; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
265 %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
266 %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
267 %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Float input bitcast to bytes and shuffled with a pshufb repeating bytes (0,1,2,3), i.e.
; a splat of element 0; the checks expect vbroadcastss.
271 define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
272 ; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
274 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
275 ; CHECK-NEXT: ret{{[l|q]}}
276 %1 = bitcast <4 x float> %a to <16 x i8>
277 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
278 %3 = bitcast <16 x i8> %2 to <4 x float>
; Widens a 128-bit float vector to 256 bits (only element 0 defined) and splats element 0
; with a zero-index permps; the checks expect an xmm-to-ymm vbroadcastss.
282 define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
283 ; CHECK-LABEL: combine_permps_as_vpbroadcastss256:
285 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
286 ; CHECK-NEXT: ret{{[l|q]}}
287 %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
288 %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
; Widens a 2 x double input to 4 x double (only element 0 defined), bitcasts to floats and
; applies a permps repeating float indices (0,1), which replicates the low double; the
; checks expect vbroadcastsd.
292 define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
293 ; CHECK-LABEL: combine_permps_as_vpbroadcastsd256:
295 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
296 ; CHECK-NEXT: ret{{[l|q]}}
297 %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
298 %2 = bitcast <4 x double> %1 to <8 x float>
299 %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
300 %4 = bitcast <8 x float> %3 to <4 x double>
; A byte splat (shufflevector with all-zero mask) followed by a second byte splat (pshufb
; with all-zero mask); the checks expect the redundant pair to become one vpbroadcastb.
304 define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
305 ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
307 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
308 ; CHECK-NEXT: ret{{[l|q]}}
309 %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
310 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
; 256-bit version of the redundant-splat test: a full-width byte splat via shufflevector
; followed by a per-lane zero-mask pshufb; the checks expect one xmm-to-ymm vpbroadcastb.
314 define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
315 ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
317 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
318 ; CHECK-NEXT: ret{{[l|q]}}
319 %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
320 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
; A float splat of element 0 via shufflevector, followed (after a bitcast to bytes) by a
; pshufb that again repeats bytes (0,1,2,3); the checks expect one vbroadcastss.
324 define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
325 ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
327 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
328 ; CHECK-NEXT: ret{{[l|q]}}
329 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
330 %2 = bitcast <4 x float> %1 to <16 x i8>
331 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
332 %4 = bitcast <16 x i8> %3 to <4 x float>
; A widening float splat of element 0 via shufflevector followed by a zero-index permps
; (another splat of element 0); the checks expect one xmm-to-ymm vbroadcastss.
336 define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
337 ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
339 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
340 ; CHECK-NEXT: ret{{[l|q]}}
341 %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
342 %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
; A widening double splat of element 0 via shufflevector, then (viewed as floats) a permps
; repeating float indices (0,1), which again replicates the low double; the checks expect
; one vbroadcastsd.
346 define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
347 ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
349 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
350 ; CHECK-NEXT: ret{{[l|q]}}
351 %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
352 %2 = bitcast <4 x double> %1 to <8 x float>
353 %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
354 %4 = bitcast <8 x float> %3 to <4 x double>
; The permd indices come in adjacent even/odd pairs (0,1 / 4,5 / 4,5 / 2,3), so the
; permute moves whole 64-bit elements in the order [0,2,2,1]; the checks expect an
; immediate-controlled vpermpd instead of a variable permute.
358 define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
359 ; CHECK-LABEL: combine_permd_as_permq:
361 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
362 ; CHECK-NEXT: ret{{[l|q]}}
363 %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
; The permps indices come in adjacent even/odd pairs (6,7 / 4,5 / 0,1 / 2,3), so the
; permute moves whole 64-bit elements in the order [3,2,0,1]; the checks expect an
; immediate-controlled vpermpd.
367 define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
368 ; CHECK-LABEL: combine_permps_as_permpd:
370 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
371 ; CHECK-NEXT: ret{{[l|q]}}
372 %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
; Element 0 of the permps index vector is the runtime value %a1, but the final
; shufflevector overwrites result element 0 with element 1, so the variable index is never
; observed. The remaining constant indices stay within their own 128-bit lane, and the
; checks expect an immediate vpermilps with pattern [2,2,1,0,7,6,5,4].
376 define <8 x float> @combine_permps_as_vpermilps(<8 x float> %a, i32 %a1) {
377 ; CHECK-LABEL: combine_permps_as_vpermilps:
379 ; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,7,6,5,4]
380 ; CHECK-NEXT: ret{{[l|q]}}
381 %1 = insertelement <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>, i32 %a1, i32 0
382 %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> %1)
383 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; The shufflevector rearranges the low 16 bytes of %a0 into both lanes (qwords swapped in
; the low lane, in order in the high lane). The pshufb then places one source word at the
; bottom of each 64-bit element and zeroes the other six bytes. Composed, the result is
; words 0-3 of %a0 zero-extended to i64, which the checks expect as vpmovzxwq.
387 define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
388 ; CHECK-LABEL: combine_pshufb_as_zext:
390 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
391 ; CHECK-NEXT: ret{{[l|q]}}
392 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
393 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
394 %3 = bitcast <32 x i8> %2 to <4 x i64>
; Both lanes of %1 hold the byte-reversed low 16 bytes of %a0. The pshufb then picks byte
; pairs (15,14) and (13,12) of each lane into the bottom of each 64-bit element and zeroes
; the rest. The current expected output is not a single zero-extend but a three-instruction
; sequence: xmm vpshufb, vpermq duplicating the low half, then a ymm vpshufb.
398 define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
399 ; CHECK-LABEL: combine_pshufb_as_zext128:
401 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0]
402 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
403 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
404 ; CHECK-NEXT: ret{{[l|q]}}
405 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
406 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
407 %3 = bitcast <32 x i8> %2 to <4 x i64>
; The pshufb keeps bytes 0-7 (the low 64-bit element) and zeroes the remaining 24 bytes;
; the checks expect a vmovq that keeps element 0 and zeroes the rest.
411 define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
412 ; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
414 ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
415 ; CHECK-NEXT: ret{{[l|q]}}
416 %1 = bitcast <4 x double> %a0 to <32 x i8>
417 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
418 %3 = bitcast <32 x i8> %2 to <4 x double>
; The pshufb keeps bytes 0-3 (the low 32-bit element) and zeroes the remaining 28 bytes;
; the checks expect a zeroed register blended so that only element 0 comes from the input.
422 define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
423 ; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
425 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
426 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
427 ; CHECK-NEXT: ret{{[l|q]}}
428 %1 = bitcast <8 x float> %a0 to <32 x i8>
429 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
430 %3 = bitcast <32 x i8> %2 to <8 x float>
; The pshufb mask zeroes the low 10 bytes of each 128-bit lane and moves bytes 0-5 to the
; top, i.e. a per-lane byte shift left by 10; the checks expect vpslldq.
434 define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
435 ; CHECK-LABEL: combine_pshufb_as_pslldq:
437 ; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
438 ; CHECK-NEXT: ret{{[l|q]}}
439 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
; The pshufb mask moves byte 15 of each 128-bit lane to byte 0 and zeroes the other 15
; bytes, i.e. a per-lane byte shift right by 15; the checks expect vpsrldq.
443 define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
444 ; CHECK-LABEL: combine_pshufb_as_psrldq:
446 ; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
447 ; CHECK-NEXT: ret{{[l|q]}}
448 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
; The pshufb mask moves the high byte of every 16-bit word into its low byte and zeroes
; the high byte; the checks expect a word-wise logical right shift by 8 bits.
452 define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
453 ; CHECK-LABEL: combine_pshufb_as_psrlw:
455 ; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
456 ; CHECK-NEXT: ret{{[l|q]}}
457 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
; The pshufb mask moves the lowest byte of every 32-bit element into its highest byte and
; zeroes the other three; the checks expect a dword-wise left shift by 24 bits.
461 define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
462 ; CHECK-LABEL: combine_pshufb_as_pslld:
464 ; CHECK-NEXT: vpslld $24, %ymm0, %ymm0
465 ; CHECK-NEXT: ret{{[l|q]}}
466 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
; The pshufb mask moves the top three bytes of every 64-bit element to its bottom and
; zeroes the upper five; the checks expect a qword-wise logical right shift by 40 bits.
470 define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
471 ; CHECK-LABEL: combine_pshufb_as_psrlq:
473 ; CHECK-NEXT: vpsrlq $40, %ymm0, %ymm0
474 ; CHECK-NEXT: ret{{[l|q]}}
475 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
; The pshufb mask swaps adjacent words within the low 8 bytes of each 128-bit lane and
; leaves the high 8 bytes in place; the checks expect vpshuflw with word order 1,0,3,2.
479 define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
480 ; CHECK-LABEL: combine_pshufb_as_pshuflw:
482 ; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
483 ; CHECK-NEXT: ret{{[l|q]}}
484 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
; The pshufb mask leaves the low 8 bytes of each 128-bit lane in place and swaps adjacent
; words within the high 8 bytes; the checks expect vpshufhw with word order 5,4,7,6.
488 define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
489 ; CHECK-LABEL: combine_pshufb_as_pshufhw:
491 ; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
492 ; CHECK-NEXT: ret{{[l|q]}}
493 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
; Two chained pshufbs: the first swaps adjacent words in the low 8 bytes of each lane, the
; second does the same in the high 8 bytes. Together every 32-bit element has its two
; words swapped. The AVX2 runs expect a single vpshufb for the combined mask (not a
; pshuflw/pshufhw pair); the AVX512 runs expect a 16-bit dword rotate (vprold) performed
; on the containing zmm register.
497 define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
498 ; AVX2-LABEL: combine_pshufb_not_as_pshufw:
500 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
501 ; AVX2-NEXT: ret{{[l|q]}}
503 ; AVX512-LABEL: combine_pshufb_not_as_pshufw:
505 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
506 ; AVX512-NEXT: vprold $16, %zmm0, %zmm0
507 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
508 ; AVX512-NEXT: ret{{[l|q]}}
509 %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
510 %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
; The pshufb mask has undef in every even byte position, and the following shufflevector
; reads only those even positions (each one duplicated). Every result byte therefore comes
; from an undefined mask element, and the checks expect no shuffle instruction at all -
; just the return.
514 define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
515 ; CHECK-LABEL: combine_pshufb_as_unpacklo_undef:
517 ; CHECK-NEXT: ret{{[l|q]}}
518 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
519 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
; The pshufb mask interleaves the low four words of each 128-bit lane with zero words
; (source word first, zero second); the checks expect vpunpcklwd of the input with a
; zeroed register.
523 define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
524 ; CHECK-LABEL: combine_pshufb_as_unpacklo_zero:
526 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
527 ; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
528 ; CHECK-NEXT: ret{{[l|q]}}
529 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
; The pshufb mask interleaves zero bytes with the high eight bytes of each 128-bit lane
; (zero first, source byte second); the checks expect vpunpckhbw with the zeroed register
; as the first operand.
533 define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
534 ; CHECK-LABEL: combine_pshufb_as_unpackhi_zero:
536 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
537 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
538 ; CHECK-NEXT: ret{{[l|q]}}
539 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
; A word-wise logical right shift by 8 followed by a pshufb that swaps the two bytes of
; every word. The shift moves each high byte down and the swap moves it back up, so the
; net effect only clears the low byte of each word; the checks expect a single AND with a
; constant-pool mask.
543 define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
544 ; X86-LABEL: combine_psrlw_pshufb:
546 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
549 ; X64-LABEL: combine_psrlw_pshufb:
551 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
553 %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
554 %2 = bitcast <16 x i16> %1 to <32 x i8>
555 %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
; A dword-wise left shift by 24 followed by a pshufb that reverses the four bytes of every
; dword. The shift moves each low byte to the top and the reversal moves it back, so the
; net effect only keeps the low byte of each dword; the checks expect a single AND with a
; constant-pool mask.
559 define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
560 ; X86-LABEL: combine_pslld_pshufb:
562 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
565 ; X64-LABEL: combine_pslld_pshufb:
567 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
569 %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
570 %2 = bitcast <8 x i32> %1 to <32 x i8>
571 %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
; A qword-wise logical right shift by 32 followed by a pshufb; the checks expect both to
; be merged into one vpshufb whose mask contains explicit zero bytes where the shifted-out
; data would have been read.
; NOTE(review): the lower-lane mask is a plain per-qword byte reversal, but the upper-lane
; mask is irregular (it runs 23..17 and then 31..23) - presumably intentional to exercise
; a non-uniform mask; confirm before "fixing" it, since the expected output matches it.
575 define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
576 ; CHECK-LABEL: combine_psrlq_pshufb:
578 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
579 ; CHECK-NEXT: ret{{[l|q]}}
580 %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
581 %2 = bitcast <4 x i64> %1 to <32 x i8>
582 %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
; Chain of six shufflevector instructions on a single input vector:
;   %1      repeats elements 0-3 (and 16-19 in the upper half) four times,
;   %2, %3  pick elements 4-7/20-23 and 8-11/24-27, remaining lanes undef,
;   %4, %5  interleave the low elements of %1 with %2 and with %3,
;   %6      interleaves %4 with %5.
; The expectation is that the whole chain becomes one single-input vpshufb.
586 define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
587 ; CHECK-LABEL: combine_unpack_unpack_pshufb:
589 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
590 ; CHECK-NEXT: ret{{[l|q]}}
591 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
592 %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
593 %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
594 %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
595 %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
596 %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
; Arithmetic right shift of every i32 by 31, then the AVX2 PACKSSDW intrinsic
; with the shifted value as both operands, then a shufflevector on the packed
; i16 result. The expectation keeps the shift (vpsrad $31) and replaces the
; pack plus shuffle with one vpshufb; no pack instruction is expected.
600 define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
601 ; CHECK-LABEL: shuffle_combine_packssdw_pshufb:
603 ; CHECK-NEXT: vpsrad $31, %ymm0, %ymm0
604 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
605 ; CHECK-NEXT: ret{{[l|q]}}
606 %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
607 %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
608 %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
611 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
; Arithmetic right shift of both i16 inputs by 15, the AVX2 PACKSSWB intrinsic
; on the two shifted values, then PSHUFB with the mask 7..0 repeated four
; times. The expected output only touches ymm0: one vpsraw $15 followed by one
; vpshufb, with no pack instruction and no reference to the second argument.
613 define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
614 ; CHECK-LABEL: shuffle_combine_packsswb_pshufb:
616 ; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0
617 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
618 ; CHECK-NEXT: ret{{[l|q]}}
619 %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
620 %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
621 %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
622 %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
625 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
; Logical right shift of every i32 by 16, then the AVX2 PACKUSDW intrinsic
; with the shifted value as both operands, then a shufflevector on the packed
; i16 result. The second argument %a1 is not used by the body. The expectation
; is a single vpshufb on ymm0, with neither a shift nor a pack instruction.
627 define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
628 ; CHECK-LABEL: shuffle_combine_packusdw_pshufb:
630 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
631 ; CHECK-NEXT: ret{{[l|q]}}
632 %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
633 %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
634 %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
637 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
; Calls the AVX2 PACKUSDW intrinsic with a poison second operand and then
; keeps only result elements 0-3 and 8-11 as an <8 x i16>. The expectation is
; a 128-bit sequence: extract the upper half of ymm0 (vextracti128), pack it
; with the lower half using the xmm form of vpackusdw, then vzeroupper.
639 define <8 x i16> @shuffle_combine_packusdw_permq_extract(<8 x i32> %a0) {
640 ; CHECK-LABEL: shuffle_combine_packusdw_permq_extract:
642 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
643 ; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
644 ; CHECK-NEXT: vzeroupper
645 ; CHECK-NEXT: ret{{[l|q]}}
646 %1 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> poison)
647 %2 = shufflevector <16 x i16> %1, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; Logical right shift of both i16 inputs by 8, the AVX2 PACKUSWB intrinsic on
; the two shifted values, then PSHUFB with the mask 7..0 repeated four times.
; The expectation is a single vpshufb that reads the odd bytes of ymm0; no
; shift, no pack, and no reference to the second argument is expected.
651 define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
652 ; CHECK-LABEL: shuffle_combine_packuswb_pshufb:
654 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
655 ; CHECK-NEXT: ret{{[l|q]}}
656 %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
657 %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
658 %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
659 %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
662 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
; Arithmetic right shift of both i16 inputs by 11, followed by two PSHUFB
; calls that gather the even bytes of %a0 into the first eight bytes of each
; 16-byte group and the even bytes of %a1 into the last eight (all other mask
; entries are -1), and an OR of the two results. The expectation is two vpsraw
; $11 shifts feeding one vpacksswb.
664 define <32 x i8> @combine_pshufb_as_packsswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
665 ; CHECK-LABEL: combine_pshufb_as_packsswb:
667 ; CHECK-NEXT: vpsraw $11, %ymm0, %ymm0
668 ; CHECK-NEXT: vpsraw $11, %ymm1, %ymm1
669 ; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
670 ; CHECK-NEXT: ret{{[l|q]}}
671 %1 = ashr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
672 %2 = ashr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
673 %3 = bitcast <16 x i16> %1 to <32 x i8>
674 %4 = bitcast <16 x i16> %2 to <32 x i8>
675 %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
676 %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
677 %7 = or <32 x i8> %5, %6
; Same shape as the signed variant but with logical shifts: both i16 inputs
; are shifted right by 11, two PSHUFB calls gather the even bytes of %a0 and
; %a1 into complementary eight-byte halves of each 16-byte group (all other
; mask entries are -1), and the results are OR-ed. The expectation is two
; vpsrlw $11 shifts feeding one vpackuswb.
681 define <32 x i8> @combine_pshufb_as_packuswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
682 ; CHECK-LABEL: combine_pshufb_as_packuswb:
684 ; CHECK-NEXT: vpsrlw $11, %ymm0, %ymm0
685 ; CHECK-NEXT: vpsrlw $11, %ymm1, %ymm1
686 ; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
687 ; CHECK-NEXT: ret{{[l|q]}}
688 %1 = lshr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
689 %2 = lshr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
690 %3 = bitcast <16 x i16> %1 to <32 x i8>
691 %4 = bitcast <16 x i16> %2 to <32 x i8>
692 %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
693 %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
694 %7 = or <32 x i8> %5, %6
; Inserts a scalar i64 into element 0 of an undef <2 x i64>, reinterprets it
; as bytes, and applies the 128-bit PSHUFB intrinsic with a mask that repeats
; byte indices 0-7 twice. The expectations differ per triple: the 32-bit one
; shows a vmovddup from memory, the 64-bit one a vmovq from %rdi followed by
; vpbroadcastq.
698 define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
699 ; X86-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
701 ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
704 ; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
706 ; X64-NEXT: vmovq %rdi, %xmm0
707 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0
709 %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
710 %2 = bitcast <2 x i64> %1 to <16 x i8>
711 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
; Inserts a scalar i64 into element 0 of an undef <4 x i64>, reinterprets it
; as <8 x i32>, and applies the AVX2 PERMD intrinsic with the index pair 0,1
; repeated four times. The expectations differ per triple: the 32-bit one
; shows a vbroadcastsd from a stack slot, the 64-bit one a vmovq from %rdi
; followed by a ymm vpbroadcastq.
715 define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
716 ; X86-LABEL: combine_permd_insertion_as_broadcast_v4i64:
718 ; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
721 ; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
723 ; X64-NEXT: vmovq %rdi, %xmm0
724 ; X64-NEXT: vpbroadcastq %xmm0, %ymm0
726 %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
727 %2 = bitcast <4 x i64> %1 to <8 x i32>
728 %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
; Two PSHUFB calls with complementary masks - the first keeps byte indices 0-7
; of %a0 in place, the second keeps byte indices 8-15 of %a1 in place, every
; other mask entry is -1 - followed by an OR of the two results. The
; expectation is a single vblendps of ymm0 and ymm1.
732 define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
733 ; CHECK-LABEL: combine_pshufb_pshufb_or_as_blend:
735 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
736 ; CHECK-NEXT: ret{{[l|q]}}
737 %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
738 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
739 %3 = or <32 x i8> %1, %2
; Two PSHUFB calls with complementary masks - byte indices 0-7 of %a0 are
; spread over the even result positions, byte indices 0-7 of %a1 over the odd
; positions, every other mask entry is -1 - followed by an OR of the two
; results. The expectation is a single vpunpcklbw of ymm0 and ymm1.
743 define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
744 ; CHECK-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
746 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
747 ; CHECK-NEXT: ret{{[l|q]}}
748 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
749 %2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
750 %3 = or <32 x i8> %1, %2
; Two PSHUFB calls on the same input with complementary masks (each places
; byte indices 0-3 into alternating four-byte groups, -1 elsewhere), an OR of
; the two, and a third PSHUFB on the OR result. The expectation is that the
; whole sequence becomes one vshufps with the pattern [0,0,0,0,4,4,4,4].
754 define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
755 ; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb:
757 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
758 ; CHECK-NEXT: ret{{[l|q]}}
759 %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
760 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
761 %3 = or <32 x i8> %1, %2
762 %4 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
; Calls the AVX2 PERMD intrinsic with a constant data vector (1..8) and a
; constant index vector. The expectation is that no permute is emitted; the
; result is loaded as the constant [5,7,3,2,8,2,6,1], i.e. the data elements
; picked by indices 4,6,2,1,7,1,5,0.
766 define <8 x i32> @constant_fold_permd() {
767 ; CHECK-LABEL: constant_fold_permd:
769 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
770 ; CHECK-NEXT: ret{{[l|q]}}
771 %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
; Floating-point counterpart of the PERMD constant-fold case: the AVX2 PERMPS
; intrinsic is called with constant data (1.0..8.0) and the constant indices
; 4,6,2,1,7,1,5,0. The expectation is a plain constant load of the already
; permuted values, with no permute instruction.
775 define <8 x float> @constant_fold_permps() {
776 ; CHECK-LABEL: constant_fold_permps:
778 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0]
779 ; CHECK-NEXT: ret{{[l|q]}}
780 %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
; Calls the AVX2 PSHUFB intrinsic with a constant data vector and a constant
; mask that mixes real indices, -1 entries and undef entries. The expectation
; is a constant load with no shuffle instruction; positions whose mask entry
; is undef show up as 'u' in the expected constant and -1 positions as 0.
784 define <32 x i8> @constant_fold_pshufb_256() {
785 ; CHECK-LABEL: constant_fold_pshufb_256:
787 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
788 ; CHECK-NEXT: ret{{[l|q]}}
789 %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
; Loads an i64 that is used twice: once truncated directly to i32, and once
; splatted into a <2 x i64>, truncated to <2 x i32> and read back from element
; 1. The two i32 values are added. The visible expectations for both triples
; contain only scalar code: a 32-bit load from the pointer and an addl of the
; register with itself, with no vector instructions.
793 define i32 @broadcast_v2i64_multiuse(ptr %p0) {
794 ; X86-LABEL: broadcast_v2i64_multiuse:
795 ; X86: # %bb.0: # %entry
796 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
797 ; X86-NEXT: movl (%eax), %eax
798 ; X86-NEXT: addl %eax, %eax
801 ; X64-LABEL: broadcast_v2i64_multiuse:
802 ; X64: # %bb.0: # %entry
803 ; X64-NEXT: movl (%rdi), %eax
804 ; X64-NEXT: addl %eax, %eax
807 %tmp = load i64, ptr %p0, align 8
808 %tmp1 = trunc i64 %tmp to i32
809 %tmp2 = insertelement <2 x i64> undef, i64 %tmp, i32 0
810 %tmp3 = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <2 x i32> zeroinitializer
811 %tmp4 = trunc <2 x i64> %tmp3 to <2 x i32>
812 %tmp5 = extractelement <2 x i32> %tmp4, i32 1
813 %tmp6 = add i32 %tmp1, %tmp5
; Presumably a regression test for the bug report the function is named after
; (confirm against the tracker). An <8 x i32> shuffle that selects elements
; 0,1,2,undef,3,4,5,undef is reinterpreted as bytes and followed by a byte
; shuffle. The expectation is a two-instruction sequence: a vpermq followed by
; a vpshufb.
817 define <32 x i8> @PR27320(<8 x i32> %a0) {
818 ; CHECK-LABEL: PR27320:
820 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
821 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,20,21,21,22,23,24,24,25,26,27,27,28,29,30,30,31]
822 ; CHECK-NEXT: ret{{[l|q]}}
823 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
824 %2 = bitcast <8 x i32> %1 to <32 x i8>
825 %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
; Presumably a regression test for the bug report the function is named after
; (confirm against the tracker). An internal fastcc function that builds its
; result from three shufflevectors and a select against zeroinitializer:
;   %shuf0  mixes %inp0 and %inp2,
;   %sel    keeps lanes 1,2,4,6 of %shuf0 and zeroes the rest,
;   %shuf1  mixes a zero vector with %sel,
;   %shuf2  mixes %inp1 with %shuf1 and is returned.
; The AVX2 and AVX512 run lines have separate expectations: the former uses
; vpermpd, vblendps and vpermps on ymm registers, the latter feeds a
; two-source vpermi2ps on zmm registers.
829 define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
830 ; AVX2-LABEL: PR34577:
831 ; AVX2: # %bb.0: # %entry
832 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
833 ; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
834 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
835 ; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
836 ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
837 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
838 ; AVX2-NEXT: ret{{[l|q]}}
840 ; AVX512-LABEL: PR34577:
841 ; AVX512: # %bb.0: # %entry
842 ; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
843 ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
844 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
845 ; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
846 ; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2>
847 ; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
848 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
849 ; AVX512-NEXT: ret{{[l|q]}}
851 %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
852 %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
853 %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
854 %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
855 ret <8 x float> %shuf2
; Presumably a regression test for the bug report the function is named after
; (confirm against the tracker). Three chained interleaving shufflevectors:
;   %3  interleaves the low bytes of the first argument with a constant whose
;       used elements are zero (the others are poison),
;   %4  interleaves elements of %3 with the second argument (half the lanes
;       are undef),
;   %5  interleaves the upper elements of %4 with the upper elements of %3.
; The expectation is one zero-inserting vpshufb per argument, combined by a
; vpor.
858 define <32 x i8> @PR52122(<32 x i8> %0, <32 x i8> %1) {
859 ; CHECK-LABEL: PR52122:
861 ; CHECK-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[4],zero,zero,zero,ymm1[5],zero,zero,zero,ymm1[6],zero,zero,zero,ymm1[7],zero,zero,zero,ymm1[20],zero,zero,zero,ymm1[21],zero,zero,zero,ymm1[22],zero,zero,zero,ymm1[23],zero
862 ; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,4],zero,zero,zero,ymm0[5],zero,zero,ymm0[3,6],zero,zero,zero,ymm0[7],zero,zero,ymm0[18,20],zero,zero,zero,ymm0[21],zero,zero,ymm0[19,22],zero,zero,zero,ymm0[23],zero,zero
863 ; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
864 ; CHECK-NEXT: ret{{[l|q]}}
865 %3 = shufflevector <32 x i8> %0, <32 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
866 %4 = shufflevector <32 x i8> %3, <32 x i8> %1, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
867 %5 = shufflevector <32 x i8> %4, <32 x i8> %3, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
; Presumably a regression test for the bug report the function is named after
; (confirm against the tracker). Loads a <2 x i64> from the argument, shuffles
; it together with the constant <3, 2> into an 8-element vector, and performs
; a volatile store of the result to a poison pointer.
; Each triple/feature combination has its own expectations: the AVX2 variants
; build two ymm halves with vpermpd/vblendps and store each one, while the
; AVX512 variants use one vpermi2q with a constant-pool operand and a single
; zmm store.
; NOTE(review): because the store pointer is poison, the address register
; used by the expected stores looks arbitrary (the 64-bit output stores
; through %rax, which the shown code never sets) - confirm before relying on
; it.
871 define void @PR63030(ptr %p0) {
872 ; X86-AVX2-LABEL: PR63030:
874 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
875 ; X86-AVX2-NEXT: vmovaps (%eax), %xmm0
876 ; X86-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [3,0,3,0]
877 ; X86-AVX2-NEXT: # xmm1 = mem[0,0]
878 ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0]
879 ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
880 ; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [3,0,2,0]
881 ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
882 ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
883 ; X86-AVX2-NEXT: vmovaps %ymm0, (%eax)
884 ; X86-AVX2-NEXT: vmovaps %ymm1, (%eax)
885 ; X86-AVX2-NEXT: vzeroupper
886 ; X86-AVX2-NEXT: retl
888 ; X86-AVX512-LABEL: PR63030:
889 ; X86-AVX512: # %bb.0:
890 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
891 ; X86-AVX512-NEXT: vmovdqa (%eax), %xmm0
892 ; X86-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,8,0,0,0,0,0,0,0,9,0,1,0,1,0]
893 ; X86-AVX512-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1
894 ; X86-AVX512-NEXT: vmovdqa64 %zmm1, (%eax)
895 ; X86-AVX512-NEXT: vzeroupper
896 ; X86-AVX512-NEXT: retl
898 ; X64-AVX2-LABEL: PR63030:
900 ; X64-AVX2-NEXT: vmovaps (%rdi), %xmm0
901 ; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [3,3]
902 ; X64-AVX2-NEXT: # xmm1 = mem[0,0]
903 ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0]
904 ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
905 ; X64-AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [3,2]
906 ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
907 ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
908 ; X64-AVX2-NEXT: vmovaps %ymm0, (%rax)
909 ; X64-AVX2-NEXT: vmovaps %ymm1, (%rax)
910 ; X64-AVX2-NEXT: vzeroupper
911 ; X64-AVX2-NEXT: retq
913 ; X64-AVX512-LABEL: PR63030:
914 ; X64-AVX512: # %bb.0:
915 ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
916 ; X64-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,8,0,0,0,9,1,1]
917 ; X64-AVX512-NEXT: vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
918 ; X64-AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
919 ; X64-AVX512-NEXT: vzeroupper
920 ; X64-AVX512-NEXT: retq
921 %load = load <2 x i64>, ptr %p0, align 16
922 %shuffle = shufflevector <2 x i64> <i64 3, i64 2>, <2 x i64> %load, <8 x i32> <i32 3, i32 0, i32 2, i32 2, i32 2, i32 1, i32 3, i32 3>
923 store volatile <8 x i64> %shuffle, ptr poison, align 64
927 define void @packss_zext_v8i1() {
928 ; X86-LABEL: packss_zext_v8i1:
930 ; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
931 ; X86-NEXT: vmovups %ymm0, (%eax)
932 ; X86-NEXT: vzeroupper
935 ; X64-LABEL: packss_zext_v8i1:
937 ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
938 ; X64-NEXT: vmovups %ymm0, (%rax)
939 ; X64-NEXT: vzeroupper
941 %tmp0 = icmp sgt <8 x i32> undef, undef
942 %tmp1 = zext <8 x i1> %tmp0 to <8 x i32>
943 %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
944 %tmp3 = trunc <16 x i32> %tmp2 to <16 x i16>
945 %tmp4 = add <16 x i16> zeroinitializer, %tmp3
946 %tmp6 = sext <16 x i16> %tmp4 to <16 x i32>
947 %tmp10 = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
948 %tmp11 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> %tmp10)
949 store <16 x i16> %tmp11, ptr undef, align 2