1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,X86-AVX512F
3 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86,X86-AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX512BW
7 declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
9 declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
10 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
11 declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
13 declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
14 declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
16 declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
17 declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
19 declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
20 declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
22 declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
23 declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
25 define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
26 ; CHECK-LABEL: combine_permvar_8f64_identity:
28 ; CHECK-NEXT: ret{{[l|q]}}
29 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
30 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
33 define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
34 ; X86-AVX512F-LABEL: combine_permvar_8f64_identity_mask:
35 ; X86-AVX512F: # %bb.0:
36 ; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
37 ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
38 ; X86-AVX512F-NEXT: kmovw %eax, %k1
39 ; X86-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
40 ; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
41 ; X86-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
42 ; X86-AVX512F-NEXT: vmovapd %zmm1, %zmm0
43 ; X86-AVX512F-NEXT: retl
45 ; X86-AVX512BW-LABEL: combine_permvar_8f64_identity_mask:
46 ; X86-AVX512BW: # %bb.0:
47 ; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
48 ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
49 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
50 ; X86-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
51 ; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
52 ; X86-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
53 ; X86-AVX512BW-NEXT: vmovapd %zmm1, %zmm0
54 ; X86-AVX512BW-NEXT: retl
56 ; X64-AVX512F-LABEL: combine_permvar_8f64_identity_mask:
57 ; X64-AVX512F: # %bb.0:
58 ; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
59 ; X64-AVX512F-NEXT: kmovw %edi, %k1
60 ; X64-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
61 ; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
62 ; X64-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
63 ; X64-AVX512F-NEXT: vmovapd %zmm1, %zmm0
64 ; X64-AVX512F-NEXT: retq
66 ; X64-AVX512BW-LABEL: combine_permvar_8f64_identity_mask:
67 ; X64-AVX512BW: # %bb.0:
68 ; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
69 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
70 ; X64-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
71 ; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
72 ; X64-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
73 ; X64-AVX512BW-NEXT: vmovapd %zmm1, %zmm0
74 ; X64-AVX512BW-NEXT: retq
75 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
76 %2 = bitcast i8 %m to <8 x i1>
77 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
78 %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
79 %5 = bitcast i8 %m to <8 x i1>
80 %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %3
84 define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
85 ; CHECK-LABEL: combine_permvar_8i64_identity:
87 ; CHECK-NEXT: ret{{[l|q]}}
88 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
89 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
92 define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
93 ; X86-AVX512F-LABEL: combine_permvar_8i64_identity_mask:
94 ; X86-AVX512F: # %bb.0:
95 ; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
96 ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
97 ; X86-AVX512F-NEXT: kmovw %eax, %k1
98 ; X86-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
99 ; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
100 ; X86-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
101 ; X86-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
102 ; X86-AVX512F-NEXT: retl
104 ; X86-AVX512BW-LABEL: combine_permvar_8i64_identity_mask:
105 ; X86-AVX512BW: # %bb.0:
106 ; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
107 ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
108 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
109 ; X86-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
110 ; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
111 ; X86-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
112 ; X86-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
113 ; X86-AVX512BW-NEXT: retl
115 ; X64-AVX512F-LABEL: combine_permvar_8i64_identity_mask:
116 ; X64-AVX512F: # %bb.0:
117 ; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
118 ; X64-AVX512F-NEXT: kmovw %edi, %k1
119 ; X64-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
120 ; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
121 ; X64-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
122 ; X64-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
123 ; X64-AVX512F-NEXT: retq
125 ; X64-AVX512BW-LABEL: combine_permvar_8i64_identity_mask:
126 ; X64-AVX512BW: # %bb.0:
127 ; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
128 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
129 ; X64-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
130 ; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
131 ; X64-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
132 ; X64-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
133 ; X64-AVX512BW-NEXT: retq
134 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
135 %2 = bitcast i8 %m to <8 x i1>
136 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
137 %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
138 %5 = bitcast i8 %m to <8 x i1>
139 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %3
143 define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
144 ; CHECK-LABEL: combine_vpermt2var_8f64_identity:
146 ; CHECK-NEXT: ret{{[l|q]}}
147 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
148 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
149 ret <8 x double> %res1
151 define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
152 ; X86-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
153 ; X86-AVX512F: # %bb.0:
154 ; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
155 ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
156 ; X86-AVX512F-NEXT: kmovw %eax, %k1
157 ; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
158 ; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
159 ; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
160 ; X86-AVX512F-NEXT: retl
162 ; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
163 ; X86-AVX512BW: # %bb.0:
164 ; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
165 ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
166 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
167 ; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
168 ; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
169 ; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
170 ; X86-AVX512BW-NEXT: retl
172 ; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
173 ; X64-AVX512F: # %bb.0:
174 ; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
175 ; X64-AVX512F-NEXT: kmovw %edi, %k1
176 ; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
177 ; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
178 ; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
179 ; X64-AVX512F-NEXT: retq
181 ; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
182 ; X64-AVX512BW: # %bb.0:
183 ; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
184 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
185 ; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
186 ; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
187 ; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
188 ; X64-AVX512BW-NEXT: retq
189 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
190 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
191 ret <8 x double> %res1
194 define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
195 ; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
197 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
198 ; CHECK-NEXT: ret{{[l|q]}}
199 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
200 ret <8 x double> %res0
202 define <8 x double> @combine_vpermt2var_8f64_movddup_load(ptr%p0, <8 x double> %x1) {
203 ; X86-LABEL: combine_vpermt2var_8f64_movddup_load:
205 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
206 ; X86-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
209 ; X64-LABEL: combine_vpermt2var_8f64_movddup_load:
211 ; X64-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
213 %x0 = load <8 x double>, ptr%p0
214 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
215 ret <8 x double> %res0
217 define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
218 ; X86-AVX512F-LABEL: combine_vpermt2var_8f64_movddup_mask:
219 ; X86-AVX512F: # %bb.0:
220 ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
221 ; X86-AVX512F-NEXT: kmovw %eax, %k1
222 ; X86-AVX512F-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
223 ; X86-AVX512F-NEXT: retl
225 ; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_movddup_mask:
226 ; X86-AVX512BW: # %bb.0:
227 ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
228 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
229 ; X86-AVX512BW-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
230 ; X86-AVX512BW-NEXT: retl
232 ; X64-AVX512F-LABEL: combine_vpermt2var_8f64_movddup_mask:
233 ; X64-AVX512F: # %bb.0:
234 ; X64-AVX512F-NEXT: kmovw %edi, %k1
235 ; X64-AVX512F-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
236 ; X64-AVX512F-NEXT: retq
238 ; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_movddup_mask:
239 ; X64-AVX512BW: # %bb.0:
240 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
241 ; X64-AVX512BW-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
242 ; X64-AVX512BW-NEXT: retq
243 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
244 ret <8 x double> %res0
247 define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
248 ; CHECK-LABEL: combine_vpermt2var_8i64_identity:
250 ; CHECK-NEXT: ret{{[l|q]}}
251 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
252 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
255 define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
256 ; X86-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
257 ; X86-AVX512F: # %bb.0:
258 ; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
259 ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
260 ; X86-AVX512F-NEXT: kmovw %eax, %k1
261 ; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
262 ; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
263 ; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
264 ; X86-AVX512F-NEXT: retl
266 ; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
267 ; X86-AVX512BW: # %bb.0:
268 ; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
269 ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
270 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
271 ; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
272 ; X86-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
273 ; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
274 ; X86-AVX512BW-NEXT: retl
276 ; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
277 ; X64-AVX512F: # %bb.0:
278 ; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
279 ; X64-AVX512F-NEXT: kmovw %edi, %k1
280 ; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
281 ; X64-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
282 ; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
283 ; X64-AVX512F-NEXT: retq
285 ; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
286 ; X64-AVX512BW: # %bb.0:
287 ; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
288 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
289 ; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
290 ; X64-AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
291 ; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
292 ; X64-AVX512BW-NEXT: retq
293 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
294 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
298 define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
299 ; CHECK-LABEL: combine_vpermt2var_16f32_identity:
301 ; CHECK-NEXT: ret{{[l|q]}}
302 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
303 %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
304 ret <16 x float> %res1
306 define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
307 ; X86-LABEL: combine_vpermt2var_16f32_identity_mask:
309 ; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
310 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
311 ; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
312 ; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
313 ; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
316 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask:
317 ; X64-AVX512F: # %bb.0:
318 ; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
319 ; X64-AVX512F-NEXT: kmovw %edi, %k1
320 ; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
321 ; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
322 ; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
323 ; X64-AVX512F-NEXT: retq
325 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask:
326 ; X64-AVX512BW: # %bb.0:
327 ; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
328 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
329 ; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
330 ; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
331 ; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
332 ; X64-AVX512BW-NEXT: retq
333 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
334 %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
335 ret <16 x float> %res1
338 define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
339 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup:
341 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
342 ; CHECK-NEXT: ret{{[l|q]}}
343 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
344 ret <16 x float> %res0
346 define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(ptr%p0, <16 x float> %x1) {
347 ; X86-LABEL: combine_vpermt2var_16f32_vmovddup_load:
349 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
350 ; X86-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
353 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
355 ; X64-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
357 %x0 = load <16 x float>, ptr%p0
358 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
359 ret <16 x float> %res0
361 define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
362 ; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
364 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
365 ; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
368 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
369 ; X64-AVX512F: # %bb.0:
370 ; X64-AVX512F-NEXT: kmovw %edi, %k1
371 ; X64-AVX512F-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
372 ; X64-AVX512F-NEXT: retq
374 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
375 ; X64-AVX512BW: # %bb.0:
376 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
377 ; X64-AVX512BW-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
378 ; X64-AVX512BW-NEXT: retq
379 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
380 ret <16 x float> %res0
382 define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(ptr%p0, <16 x float> %x1, i16 %m) {
383 ; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
385 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
386 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
387 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
390 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
391 ; X64-AVX512F: # %bb.0:
392 ; X64-AVX512F-NEXT: kmovw %esi, %k1
393 ; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
394 ; X64-AVX512F-NEXT: retq
396 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
397 ; X64-AVX512BW: # %bb.0:
398 ; X64-AVX512BW-NEXT: kmovd %esi, %k1
399 ; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
400 ; X64-AVX512BW-NEXT: retq
401 %x0 = load <16 x float>, ptr%p0
402 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
403 ret <16 x float> %res0
406 define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
407 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
409 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
410 ; CHECK-NEXT: ret{{[l|q]}}
411 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
412 ret <16 x float> %res0
414 define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(ptr%p0, <16 x float> %x1) {
415 ; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
417 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
418 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
421 ; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
423 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
425 %x0 = load <16 x float>, ptr%p0
426 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
427 ret <16 x float> %res0
429 define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
430 ; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
432 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
433 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
436 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
437 ; X64-AVX512F: # %bb.0:
438 ; X64-AVX512F-NEXT: kmovw %edi, %k1
439 ; X64-AVX512F-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
440 ; X64-AVX512F-NEXT: retq
442 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
443 ; X64-AVX512BW: # %bb.0:
444 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
445 ; X64-AVX512BW-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
446 ; X64-AVX512BW-NEXT: retq
447 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
448 ret <16 x float> %res0
451 define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
452 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
454 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
455 ; CHECK-NEXT: ret{{[l|q]}}
456 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
457 ret <16 x float> %res0
459 define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(ptr%p0, <16 x float> %x1) {
460 ; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
462 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
463 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
466 ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
468 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
470 %x0 = load <16 x float>, ptr%p0
471 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
472 ret <16 x float> %res0
474 define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
475 ; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
477 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
478 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
481 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
482 ; X64-AVX512F: # %bb.0:
483 ; X64-AVX512F-NEXT: kmovw %edi, %k1
484 ; X64-AVX512F-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
485 ; X64-AVX512F-NEXT: retq
487 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
488 ; X64-AVX512BW: # %bb.0:
489 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
490 ; X64-AVX512BW-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
491 ; X64-AVX512BW-NEXT: retq
492 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
493 ret <16 x float> %res0
495 define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(ptr%p0, <16 x float> %x1, i16 %m) {
496 ; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
498 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
499 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
500 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
503 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
504 ; X64-AVX512F: # %bb.0:
505 ; X64-AVX512F-NEXT: kmovw %esi, %k1
506 ; X64-AVX512F-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
507 ; X64-AVX512F-NEXT: retq
509 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
510 ; X64-AVX512BW: # %bb.0:
511 ; X64-AVX512BW-NEXT: kmovd %esi, %k1
512 ; X64-AVX512BW-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
513 ; X64-AVX512BW-NEXT: retq
514 %x0 = load <16 x float>, ptr%p0
515 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
516 ret <16 x float> %res0
519 define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
520 ; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
522 ; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
523 ; CHECK-NEXT: ret{{[l|q]}}
524 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
525 ret <16 x float> %res0
527 define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(ptr%p0, <16 x float> %x1) {
528 ; X86-LABEL: combine_vpermt2var_16f32_vpermilps_load:
530 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
531 ; X86-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
534 ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load:
536 ; X64-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
538 %x0 = load <16 x float>, ptr%p0
539 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
540 ret <16 x float> %res0
542 define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
543 ; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
545 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
546 ; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
549 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
550 ; X64-AVX512F: # %bb.0:
551 ; X64-AVX512F-NEXT: kmovw %edi, %k1
552 ; X64-AVX512F-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
553 ; X64-AVX512F-NEXT: retq
555 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
556 ; X64-AVX512BW: # %bb.0:
557 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
558 ; X64-AVX512BW-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
559 ; X64-AVX512BW-NEXT: retq
560 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
561 ret <16 x float> %res0
563 define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(ptr%p0, <16 x float> %x1, i16 %m) {
564 ; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
566 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
567 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
568 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
571 ; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
572 ; X64-AVX512F: # %bb.0:
573 ; X64-AVX512F-NEXT: kmovw %esi, %k1
574 ; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
575 ; X64-AVX512F-NEXT: retq
577 ; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
578 ; X64-AVX512BW: # %bb.0:
579 ; X64-AVX512BW-NEXT: kmovd %esi, %k1
580 ; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
581 ; X64-AVX512BW-NEXT: retq
582 %x0 = load <16 x float>, ptr%p0
583 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
584 ret <16 x float> %res0
587 define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
588 ; CHECK-LABEL: combine_vpermt2var_16i32_identity:
590 ; CHECK-NEXT: ret{{[l|q]}}
591 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
592 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
595 define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
596 ; X86-LABEL: combine_vpermt2var_16i32_identity_mask:
598 ; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
599 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
600 ; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
601 ; X86-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
602 ; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
605 ; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask:
606 ; X64-AVX512F: # %bb.0:
607 ; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
608 ; X64-AVX512F-NEXT: kmovw %edi, %k1
609 ; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
610 ; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
611 ; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
612 ; X64-AVX512F-NEXT: retq
614 ; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask:
615 ; X64-AVX512BW: # %bb.0:
616 ; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
617 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
618 ; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
619 ; X64-AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
620 ; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
621 ; X64-AVX512BW-NEXT: retq
622 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
623 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
627 define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
628 ; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
630 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
631 ; CHECK-NEXT: ret{{[l|q]}}
632 %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer)
636 define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
637 ; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
639 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
640 ; CHECK-NEXT: ret{{[l|q]}}
641 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer)
645 define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
646 ; CHECK-LABEL: combine_permvar_8i64_as_permq:
648 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
649 ; CHECK-NEXT: ret{{[l|q]}}
650 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
653 define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
654 ; X86-AVX512F-LABEL: combine_permvar_8i64_as_permq_mask:
655 ; X86-AVX512F: # %bb.0:
656 ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
657 ; X86-AVX512F-NEXT: kmovw %eax, %k1
658 ; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
659 ; X86-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
660 ; X86-AVX512F-NEXT: retl
662 ; X86-AVX512BW-LABEL: combine_permvar_8i64_as_permq_mask:
663 ; X86-AVX512BW: # %bb.0:
664 ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
665 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
666 ; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
667 ; X86-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
668 ; X86-AVX512BW-NEXT: retl
670 ; X64-AVX512F-LABEL: combine_permvar_8i64_as_permq_mask:
671 ; X64-AVX512F: # %bb.0:
672 ; X64-AVX512F-NEXT: kmovw %edi, %k1
673 ; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
674 ; X64-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
675 ; X64-AVX512F-NEXT: retq
677 ; X64-AVX512BW-LABEL: combine_permvar_8i64_as_permq_mask:
678 ; X64-AVX512BW: # %bb.0:
679 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
680 ; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
681 ; X64-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
682 ; X64-AVX512BW-NEXT: retq
683 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
684 %2 = bitcast i8 %m to <8 x i1>
685 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
689 define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
690 ; CHECK-LABEL: combine_permvar_8f64_as_permpd:
692 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
693 ; CHECK-NEXT: ret{{[l|q]}}
694 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
697 define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
698 ; X86-AVX512F-LABEL: combine_permvar_8f64_as_permpd_mask:
699 ; X86-AVX512F: # %bb.0:
700 ; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
701 ; X86-AVX512F-NEXT: kmovw %eax, %k1
702 ; X86-AVX512F-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
703 ; X86-AVX512F-NEXT: vmovapd %zmm1, %zmm0
704 ; X86-AVX512F-NEXT: retl
706 ; X86-AVX512BW-LABEL: combine_permvar_8f64_as_permpd_mask:
707 ; X86-AVX512BW: # %bb.0:
708 ; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
709 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
710 ; X86-AVX512BW-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
711 ; X86-AVX512BW-NEXT: vmovapd %zmm1, %zmm0
712 ; X86-AVX512BW-NEXT: retl
714 ; X64-AVX512F-LABEL: combine_permvar_8f64_as_permpd_mask:
715 ; X64-AVX512F: # %bb.0:
716 ; X64-AVX512F-NEXT: kmovw %edi, %k1
717 ; X64-AVX512F-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
718 ; X64-AVX512F-NEXT: vmovapd %zmm1, %zmm0
719 ; X64-AVX512F-NEXT: retq
721 ; X64-AVX512BW-LABEL: combine_permvar_8f64_as_permpd_mask:
722 ; X64-AVX512BW: # %bb.0:
723 ; X64-AVX512BW-NEXT: kmovd %edi, %k1
724 ; X64-AVX512BW-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
725 ; X64-AVX512BW-NEXT: vmovapd %zmm1, %zmm0
726 ; X64-AVX512BW-NEXT: retq
727 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
728 %2 = bitcast i8 %m to <8 x i1>
729 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
733 define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
734 ; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
736 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
737 ; CHECK-NEXT: ret{{[l|q]}}
738 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
739 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
740 ret <16 x float> %res1
743 define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
744 ; CHECK-LABEL: combine_vpermi2var_8f64_identity:
746 ; CHECK-NEXT: ret{{[l|q]}}
747 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
748 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
749 ret <8 x double> %res1
752 define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) {
753 ; CHECK-LABEL: combine_vpermi2var_8f64_as_shufpd:
755 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
756 ; CHECK-NEXT: ret{{[l|q]}}
757 %1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1)
761 define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
762 ; CHECK-LABEL: combine_vpermi2var_8i64_identity:
764 ; CHECK-NEXT: ret{{[l|q]}}
765 %res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
766 %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
770 define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
771 ; CHECK-LABEL: combine_vpermi2var_16f32_identity:
773 ; CHECK-NEXT: ret{{[l|q]}}
774 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1)
775 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1)
776 ret <16 x float> %res1
779 define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
780 ; CHECK-LABEL: combine_vpermi2var_16i32_identity:
782 ; CHECK-NEXT: ret{{[l|q]}}
783 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1)
784 %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1)
788 define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
789 ; CHECK-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
791 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
792 ; CHECK-NEXT: ret{{[l|q]}}
793 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1)
794 ret <16 x float> %res0
797 define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
798 ; CHECK-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
800 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
801 ; CHECK-NEXT: ret{{[l|q]}}
802 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1)
806 define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) {
807 ; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd:
809 ; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
810 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0
813 ; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd:
815 ; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
816 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0
818 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
819 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, i8 -1)
820 ret <8 x double> %res1
823 define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
824 ; X86-LABEL: combine_vpermt2var_8i64_as_vpermq:
826 ; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
827 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0
830 ; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
832 ; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
833 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0
835 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
836 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
840 define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) {
841 ; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps:
843 ; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
844 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
845 ; CHECK-NEXT: ret{{[l|q]}}
846 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x float> %x1, i16 -1)
847 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x float> %res0, i16 -1)
848 ret <16 x float> %res1
851 define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
852 ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd:
854 ; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
855 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
856 ; CHECK-NEXT: ret{{[l|q]}}
857 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
858 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
862 define <16 x i32> @combine_vpermt2var_16i32_as_vpsrlq(<16 x i32> %x0) {
863 ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsrlq:
865 ; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm0
866 ; CHECK-NEXT: ret{{[l|q]}}
867 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>, <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1)
871 define <16 x i32> @combine_vpermt2var_16i32_as_vpsllq(<16 x i32> %x0) {
872 ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsllq:
874 ; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0
875 ; CHECK-NEXT: ret{{[l|q]}}
876 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>, <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1)
880 define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
881 ; X86-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
883 ; X86-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
884 ; X86-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
885 ; X86-NEXT: vmovapd %zmm2, %zmm0
888 ; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
890 ; X64-NEXT: vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
891 ; X64-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
892 ; X64-NEXT: vmovapd %zmm2, %zmm0
894 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 15, i64 0, i64 8, i64 7, i64 12, i64 6, i64 11, i64 4>, <8 x double> %x1, i8 -1)
895 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, <8 x double> %res0, i8 -1)
896 ret <8 x double> %res1
899 define <8 x double> @combine_vpermi2var_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1, i64 %a2) {
900 ; X86-LABEL: combine_vpermi2var_8f64_as_permpd:
902 ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
903 ; X86-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
904 ; X86-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
905 ; X86-NEXT: vinsertf64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm2, %zmm2
906 ; X86-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2
907 ; X86-NEXT: vpermpd {{.*#+}} zmm0 = zmm2[2,3,1,1,6,7,5,5]
910 ; X64-LABEL: combine_vpermi2var_8f64_as_permpd:
912 ; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6]
914 %res0 = insertelement <8 x i64> <i64 0, i64 2, i64 1, i64 3, i64 4, i64 6, i64 5, i64 7>, i64 %a2, i32 0
915 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %res0, <8 x double> %x1, i8 -1)
916 %res2 = shufflevector <8 x double> %res1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 1, i32 6, i32 7, i32 5, i32 5>
917 ret <8 x double> %res2
920 define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
921 ; CHECK-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
923 ; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
924 ; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
925 ; CHECK-NEXT: ret{{[l|q]}}
926 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25, i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>, <16 x i32> %x1, i16 -1)
927 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 17, i32 2, i32 18, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
931 define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) {
932 ; X86-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
934 ; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
935 ; X86-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
936 ; X86-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
939 ; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
941 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
942 ; X64-NEXT: vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5]
943 ; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
945 %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
946 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 3, i64 2, i64 1, i64 7, i64 0, i64 6, i64 5, i64 4>)
950 define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
951 ; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
953 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
954 ; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
955 ; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
956 ; CHECK-NEXT: ret{{[l|q]}}
957 %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
958 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 14, i32 2, i32 12, i32 4, i32 10, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>, <16 x float> %res0, i16 -1)
959 ret <16 x float> %res1
962 define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
963 ; X86-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
965 ; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
968 ; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
970 ; X64-NEXT: vpbroadcastq %rdi, %zmm0
972 %1 = insertelement <8 x i64> undef, i64 %a0, i32 0
973 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
977 define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8 x i64> %a1) {
978 ; X86-AVX512F-LABEL: blend_of_permutes_v16i32:
979 ; X86-AVX512F: # %bb.0:
980 ; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
981 ; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
982 ; X86-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A
983 ; X86-AVX512F-NEXT: kmovw %eax, %k1
984 ; X86-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
985 ; X86-AVX512F-NEXT: retl
987 ; X86-AVX512BW-LABEL: blend_of_permutes_v16i32:
988 ; X86-AVX512BW: # %bb.0:
989 ; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
990 ; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
991 ; X86-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A
992 ; X86-AVX512BW-NEXT: kmovd %eax, %k1
993 ; X86-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
994 ; X86-AVX512BW-NEXT: retl
996 ; X64-AVX512F-LABEL: blend_of_permutes_v16i32:
997 ; X64-AVX512F: # %bb.0:
998 ; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
999 ; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
1000 ; X64-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A
1001 ; X64-AVX512F-NEXT: kmovw %eax, %k1
1002 ; X64-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1003 ; X64-AVX512F-NEXT: retq
1005 ; X64-AVX512BW-LABEL: blend_of_permutes_v16i32:
1006 ; X64-AVX512BW: # %bb.0:
1007 ; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
1008 ; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
1009 ; X64-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A
1010 ; X64-AVX512BW-NEXT: kmovd %eax, %k1
1011 ; X64-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1012 ; X64-AVX512BW-NEXT: retq
1013 %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
1014 %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
1015 %x0 = bitcast <8 x i64> %s0 to <16 x i32>
1016 %x1 = bitcast <8 x i64> %s1 to <16 x i32>
1017 %r = shufflevector <16 x i32> %x0, <16 x i32> %x1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>