; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
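
; This file checks that chains of AVX512 shuffle intrinsics (permvar, pshufb,
; vpermilvar and vpermi2var/vpermt2var) are simplified by the X86 shuffle
; combiner into simpler shuffles, broadcasts or the identity, on both the
; 32-bit (X86) and 64-bit (X64) targets.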

declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)

declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>)

declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)

declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)

declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
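
; Note: permvar indices are read modulo the element count (VPERMPD with a
; variable control only uses bits [2:0] of each 64-bit lane), and for
; vpermt2var/vpermi2var the high index bit merely selects the second source,
; which the identity tests pass as the same vector. An index vector such as
; <7,14,5,12,3,10,1,8> therefore acts as <7,6,5,4,3,2,1,0>, and two such
; reversals compose to the identity, so the *_identity tests below expect the
; shuffles to fold away completely.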
define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_permvar_8f64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x double> %2
}

define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-LABEL: combine_permvar_8f64_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X86-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permvar_8f64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
  %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %3
  ret <8 x double> %6
}

define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_permvar_8i64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x i64> %2
}

define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-LABEL: combine_permvar_8i64_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permvar_8i64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
  %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %3
  ret <8 x i64> %6
}

define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}

define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-LABEL: combine_vpermt2var_8f64_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
  ret <8 x double> %res1
}
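
; An index pattern of <0,0,2,2,4,4,6,6> duplicates each even-numbered double
; into the following odd lane, which is exactly VMOVDDUP's behaviour, so the
; vpermt2var calls below should lower to a single vmovddup (folding the load
; and the zeroing mask where present).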
define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}

define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
; X86-LABEL: combine_vpermt2var_8f64_movddup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %x0 = load <8 x double>, <8 x double> *%p0
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}

define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  ret <8 x double> %res0
}

define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_vpermt2var_8i64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}

define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-LABEL: combine_vpermt2var_8i64_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
  ret <8 x i64> %res1
}

define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X86-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
  ret <16 x float> %res1
}

define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; CHECK-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %zmm1
; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %zmm1
; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %zmm1
; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %zmm1
; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermt2var_16i32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16i32_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X86-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
  ret <16 x i32> %res1
}

define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
; CHECK-LABEL: combine_vpermt2var_32i16_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
; X86-LABEL: combine_vpermt2var_32i16_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X86-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X64-NEXT:    vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
  %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
  ret <32 x i16> %res1
}

define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
; CHECK-LABEL: combine_pshufb_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
  %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
  ret <64 x i8> %res1
}

define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; X86-LABEL: combine_pshufb_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
; X86-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
; X86-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
; X64-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
; X64-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
  %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
  %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
  ret <64 x i8> %res1
}
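
; A permvar with an all-zeros index vector selects element 0 in every lane,
; so the permutes below should fold to a broadcast of the low element.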
define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
; CHECK-LABEL: combine_permvar_as_vpbroadcastw512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer)
  ret <32 x i16> %1
}

define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer)
  ret <16 x i32> %1
}

define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer)
  ret <8 x i64> %1
}

define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_permvar_8i64_as_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  ret <8 x i64> %1
}

define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-LABEL: combine_permvar_8i64_as_permq_mask:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permvar_8i64_as_permq_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
  ret <8 x i64> %3
}

define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_permvar_8f64_as_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  ret <8 x double> %1
}

define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-LABEL: combine_permvar_8f64_as_permpd_mask:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_permvar_8f64_as_permpd_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
  ret <8 x double> %3
}

define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
  ret <16 x float> %res1
}
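
; For VPSHUFB, an index byte with its high bit set (0x80 = 128) zeroes that
; byte, and the shuffle never crosses a 128-bit lane. The masks below are
; therefore equivalent to a byte shift of each 16-byte lane, i.e. the
; PSLLDQ/PSRLDQ patterns named in the test names.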
define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_pslldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> undef, i64 -1)
  ret <64 x i8> %res0
}

define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
; X86-LABEL: combine_pshufb_as_pslldq_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslldq_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X64-NEXT:    retq
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> zeroinitializer, i64 %m)
  ret <64 x i8> %res0
}

define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_psrldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> undef, i64 -1)
  ret <64 x i8> %res0
}

define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
; X86-LABEL: combine_pshufb_as_psrldq_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrldq_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> zeroinitializer, i64 %m)
  ret <64 x i8> %res0
}

define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
; CHECK-LABEL: combine_permvar_as_pshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>)
  ret <32 x i16> %1
}

define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
; CHECK-LABEL: combine_pshufb_as_pshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>)
  ret <32 x i16> %1
}
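
; After a shift right by 11, every i16 element already fits in an i8, so
; selecting the low byte of each word into one half of every 16-byte lane
; (zeroing the other half) and OR'ing the two results together matches the
; per-lane layout of a saturating pack; the combiner is expected to turn the
; pshufb+or sequence into vpacksswb/vpackuswb.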
define <64 x i8> @combine_pshufb_as_packsswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $11, %zmm0, %zmm0
; CHECK-NEXT:    vpsraw $11, %zmm1, %zmm1
; CHECK-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = ashr <32 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = ashr <32 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <32 x i16> %1 to <64 x i8>
  %4 = bitcast <32 x i16> %2 to <64 x i8>
  %5 = tail call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %3, <64 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> undef, i64 -1)
  %6 = tail call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %4, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>, <64 x i8> undef, i64 -1)
  %7 = or <64 x i8> %5, %6
  ret <64 x i8> %7
}

define <64 x i8> @combine_pshufb_as_packuswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; CHECK-LABEL: combine_pshufb_as_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $11, %zmm0, %zmm0
; CHECK-NEXT:    vpsrlw $11, %zmm1, %zmm1
; CHECK-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = lshr <32 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %2 = lshr <32 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %3 = bitcast <32 x i16> %1 to <64 x i8>
  %4 = bitcast <32 x i16> %2 to <64 x i8>
  %5 = tail call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %3, <64 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <64 x i8> undef, i64 -1)
  %6 = tail call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %4, <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>, <64 x i8> undef, i64 -1)
  %7 = or <64 x i8> %5, %6
  ret <64 x i8> %7
}

define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) {
; CHECK-LABEL: combine_vpermi2var_32i16_as_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>)
  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %1, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>)
  ret <32 x i16> %2
}

define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermi2var_8f64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}

define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermi2var_8f64_as_shufpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1)
  ret <8 x double> %1
}

define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_vpermi2var_8i64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}

define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermi2var_16f32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermi2var_16i32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1)
  ret <16 x float> %res0
}

define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1)
  ret <16 x i32> %res0
}

define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
; CHECK-LABEL: combine_vpermi2var_32i16_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) {
; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}

define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
; X86-LABEL: combine_vpermt2var_8i64_as_vpermq:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}

define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; CHECK-LABEL: combine_vpermi2var_32i16_as_permw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 31, i16 1, i16 30, i16 2, i16 29, i16 3, i16 28, i16 4, i16 27, i16 5, i16 26, i16 6, i16 25, i16 7, i16 24, i16 8, i16 23, i16 9, i16 22, i16 10, i16 21, i16 11, i16 20, i16 12, i16 19, i16 13, i16 18, i16 14, i16 17, i16 15, i16 16>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
; X86-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X86:       # %bb.0:
; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
; X86-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
; X64-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 15, i64 0, i64 8, i64 7, i64 12, i64 6, i64 11, i64 4>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
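
; In the next test element 0 of the index vector is variable, but the trailing
; shufflevector never reads lane 0 of the vpermi2var result, so on X64 the
; whole chain is expected to fold to a single immediate vpermpd. On X86 the
; i64 argument arrives as two 32-bit halves, which leaves the index build-up
; and a two-instruction permute sequence in the output.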
define <8 x double> @combine_vpermi2var_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1, i64 %a2) {
; X86-LABEL: combine_vpermi2var_8f64_as_permpd:
; X86:       # %bb.0:
; X86-NEXT:    movl $2, %eax
; X86-NEXT:    vmovd %eax, %xmm2
; X86-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; X86-NEXT:    vinserti128 $1, {{\.LCPI.*}}, %ymm2, %ymm2
; X86-NEXT:    vinserti64x4 $1, {{\.LCPI.*}}, %zmm2, %zmm2
; X86-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2
; X86-NEXT:    vpermpd {{.*#+}} zmm0 = zmm2[2,3,1,1,6,7,5,5]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_permpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6]
; X64-NEXT:    retq
  %res0 = insertelement <8 x i64> <i64 0, i64 2, i64 1, i64 3, i64 4, i64 6, i64 5, i64 7>, i64 %a2, i32 0
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %res0, <8 x double> %x1, i8 -1)
  %res2 = shufflevector <8 x double> %res1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 1, i32 6, i32 7, i32 5, i32 5>
  ret <8 x double> %res2
}

define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; CHECK-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25, i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 17, i32 2, i32 18, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; CHECK-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; CHECK-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 0, i16 63, i16 1, i16 61, i16 2, i16 59, i16 3, i16 57, i16 4, i16 55, i16 5, i16 53, i16 6, i16 51, i16 7, i16 49, i16 8, i16 47, i16 9, i16 45, i16 10, i16 43, i16 11, i16 41, i16 12, i16 39, i16 13, i16 37, i16 14, i16 35, i16 15, i16 33>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}
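
; A shuffle that blends with a zero vector can be folded into the variable
; permute by feeding a zeroed register as the second source of
; vpermt2pd/vpermt2ps, which is what the checks below expect.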
define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) {
; X86-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X86:       # %bb.0:
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
; X86-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5]
; X64-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 3, i64 2, i64 1, i64 7, i64 0, i64 6, i64 5, i64 4>)
  ret <8 x double> %1
}

define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; CHECK-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 14, i32 2, i32 12, i32 4, i32 10, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
; X86-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X86:       # %bb.0:
; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %rdi, %zmm0
; X64-NEXT:    retq
  %1 = insertelement <8 x i64> undef, i64 %a0, i32 0
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
  ret <8 x i64> %2
}