; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X86,AVX2,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64,AVX2,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX512,X64-AVX512

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)

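; Check that a pshufb which zeroes half of each 128-bit lane, followed by a
; shuffle that reads only the zeroed bytes, folds to an all-zeros vector.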
define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pslldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_psrldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; CHECK-LABEL: combine_pshufb_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; CHECK-LABEL: combine_pshufb_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_and_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_and:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %2
}

define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X86-LABEL: combine_permq_pshufb_as_vperm2i128:
; X86:       # %bb.0:
; X86-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X86-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
; X64:       # %bb.0:
; X64-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; CHECK-LABEL: combine_as_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3>
  ret <8 x i32> %3
}

define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
; CHECK-LABEL: combine_as_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
  ret <8 x float> %3
}

define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ret <32 x i8> %3
}

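; Splatted pshufb/vpermd/vpermps masks should be matched as a single broadcast
; instruction.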
define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; X86-LABEL: combine_pshufb_as_vpbroadcastd128:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd %xmm0, %xmm0
; X86-NEXT:    vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
  ret <16 x i8> %2
}

define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; X86-LABEL: combine_permd_as_vpbroadcastd256:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd %xmm0, %ymm0
; X86-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastd256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; X86-LABEL: combine_permd_as_vpbroadcastq256:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastq %xmm0, %ymm0
; X86-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastq256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x float> %a to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
; CHECK-LABEL: combine_permps_as_vpbroadcastss256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
; CHECK-LABEL: combine_permps_as_vpbroadcastsd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
  ret <16 x i8> %2
}

define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
  ret <32 x i8> %2
}

define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

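; Dword permute masks whose elements pair up into whole qwords should lower to
; an immediate 64-bit element permute.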
define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; CHECK-LABEL: combine_permd_as_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
  ret <8 x i32> %1
}

define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; CHECK-LABEL: combine_permps_as_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
  ret <8 x float> %1
}

define <8 x float> @combine_permps_as_vpermilps(<8 x float> %a, i32 %a1) {
; CHECK-LABEL: combine_permps_as_vpermilps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = insertelement <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>, i32 %a1, i32 0
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> %1)
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %3
}

define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_zext:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_zext128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x double>
  ret <4 x double> %3
}

define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <8 x float>
  ret <8 x float> %3
}

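; pshufb masks that only shift bytes (zero-filling the vacated positions)
; within each 128-bit lane should lower to the equivalent immediate shift
; instructions.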
define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $8, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $24, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $40, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_not_as_pshufw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res1
}

define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpacklo_undef:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpacklo_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpackhi_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
  ret <32 x i8> %1
}

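; A shift followed by a pshufb that moves the shifted bytes straight back
; should simplify to a single AND with a constant mask.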
define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X86-LABEL: combine_psrlw_pshufb:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <16 x i16> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X86-LABEL: combine_pslld_pshufb:
; X86:       # %bb.0:
; X86-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; CHECK-LABEL: combine_psrlq_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_unpack_unpack_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %6
}

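; If the pack inputs saturate to themselves (all-sign or shifted values), the
; packss/packus plus shuffle should fold to a single pshufb on the unpacked
; data.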
define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
; CHECK-LABEL: shuffle_combine_packssdw_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: shuffle_combine_packsswb_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $15, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: shuffle_combine_packusdw_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: shuffle_combine_packuswb_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @combine_pshufb_as_packsswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $11, %ymm0, %ymm0
; CHECK-NEXT:    vpsraw $11, %ymm1, %ymm1
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = ashr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <16 x i16> %1 to <32 x i8>
  %4 = bitcast <16 x i16> %2 to <32 x i8>
  %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %7 = or <32 x i8> %5, %6
  ret <32 x i8> %7
}

define <32 x i8> @combine_pshufb_as_packuswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $11, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlw $11, %ymm1, %ymm1
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = lshr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <16 x i16> %1 to <32 x i8>
  %4 = bitcast <16 x i16> %2 to <32 x i8>
  %5 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %6 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %4, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %7 = or <32 x i8> %5, %6
  ret <32 x i8> %7
}

define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
; X86-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X86:       # %bb.0:
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %3
}

define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
; X86-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X86:       # %bb.0:
; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <4 x i64> %1 to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  ret <8 x i32> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: combine_pshufb_pshufb_or_as_blend:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %3 = or <32 x i8> %1, %2
  ret <32 x i8> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
  %2 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
  %3 = or <32 x i8> %1, %2
  ret <32 x i8> %3
}

define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
  %3 = or <32 x i8> %1, %2
  %4 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <32 x i8> %4
}

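; Shuffle intrinsics with all-constant operands should constant fold to a
; loaded constant.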
define <8 x i32> @constant_fold_permd() {
; CHECK-LABEL: constant_fold_permd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x i32> %1
}

define <8 x float> @constant_fold_permps() {
; CHECK-LABEL: constant_fold_permps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x float> %1
}

define <32 x i8> @constant_fold_pshufb_256() {
; CHECK-LABEL: constant_fold_pshufb_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <32 x i8> %1
}

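; Reduced regression tests for PR27320 and PR34577 follow.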
define <32 x i8> @PR27320(<8 x i32> %a0) {
; CHECK-LABEL: PR27320:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
  ret <32 x i8> %3
}

define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
; CHECK-LABEL: PR34577:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; CHECK-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
  %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
  %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
  %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
  ret <8 x float> %shuf2
}

define void @packss_zext_v8i1() {
; X86-LABEL: packss_zext_v8i1:
; X86:       # %bb.0:
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: packss_zext_v8i1:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovups %ymm0, (%rax)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %tmp0 = icmp sgt <8 x i32> undef, undef
  %tmp1 = zext <8 x i1> %tmp0 to <8 x i32>
  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = trunc <16 x i32> %tmp2 to <16 x i16>
  %tmp4 = add <16 x i16> zeroinitializer, %tmp3
  %tmp6 = sext <16 x i16> %tmp4 to <16 x i32>
  %tmp10 = shufflevector <16 x i32> %tmp6, <16 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %tmp11 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> %tmp10)
  store <16 x i16> %tmp11, <16 x i16>* undef, align 2
  ret void
}