1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-SLOW,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-FAST,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-SLOW,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-FAST,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7
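; The tests below load a <64 x i1> mask vector, take its low VF bits, replicate
; each bit REPLICATION-FACTOR times, and use the widened mask to drive a masked
; load of i32 elements from the second pointer; the loaded data is then stored
; unmasked to the third pointer. The first group covers replication factor 2
; for VF = 2 through VF = 64.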
11 define void @mask_replication_factor2_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
12 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2:
13 ; AVX512F-ONLY: # %bb.0:
14 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
15 ; AVX512F-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
16 ; AVX512F-ONLY-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
17 ; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %xmm0
18 ; AVX512F-ONLY-NEXT: vptestmd %xmm0, %xmm0, %k1
19 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
20 ; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
21 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf2:
; AVX512DQ: # %bb.0:
25 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
26 ; AVX512DQ-NEXT: vpmovm2d %k0, %xmm0
27 ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %xmm0
28 ; AVX512DQ-NEXT: vpmovd2m %xmm0, %k1
29 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf2:
; AVX512BW: # %bb.0:
35 ; AVX512BW-NEXT: kmovq (%rdi), %k1
36 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
37 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
38 ; AVX512BW-NEXT: vpmovsxdq %xmm0, %xmm0
39 ; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k1
40 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: retq
43 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
44 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
45 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
46 %data = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %in.vec, i32 64, <4 x i1> %tgt.mask, <4 x i32> poison)
47 %data.padded = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <4 x i32> %data, ptr %out.vec, align 64
ret void
}
52 define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
53 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf4:
54 ; AVX512F-ONLY: # %bb.0:
55 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
56 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
57 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
58 ; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %ymm0
59 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
60 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
61 ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
62 ; AVX512F-ONLY-NEXT: vzeroupper
63 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf4:
; AVX512DQ: # %bb.0:
67 ; AVX512DQ-NEXT: kmovb (%rdi), %k0
68 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
69 ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
70 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k1
71 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
72 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf4:
; AVX512BW: # %bb.0:
78 ; AVX512BW-NEXT: kmovw (%rdi), %k1
79 ; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
80 ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
81 ; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
82 ; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1
83 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
84 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
87 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
88 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
89 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
90 %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
91 %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i32> %data, ptr %out.vec, align 64
ret void
}
96 define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
97 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8:
98 ; AVX512F-ONLY: # %bb.0:
99 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
100 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
101 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
102 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
103 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
104 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
105 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
106 ; AVX512F-ONLY-NEXT: vzeroupper
107 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf8:
; AVX512DQ: # %bb.0:
111 ; AVX512DQ-NEXT: kmovb (%rdi), %k0
112 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
113 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
114 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
115 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
116 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
117 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
118 ; AVX512DQ-NEXT: vzeroupper
119 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf8:
; AVX512BW: # %bb.0:
123 ; AVX512BW-NEXT: kmovw (%rdi), %k1
124 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
125 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
126 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
127 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
128 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
129 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
130 ; AVX512BW-NEXT: vzeroupper
131 ; AVX512BW-NEXT: retq
132 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
133 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
134 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
135 %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
store <16 x i32> %data, ptr %out.vec, align 64
ret void
}
140 define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
141 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16:
142 ; AVX512F-ONLY: # %bb.0:
143 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
144 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
145 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
146 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
147 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
148 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
149 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
150 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
151 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
152 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
153 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
154 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
155 ; AVX512F-ONLY-NEXT: vzeroupper
156 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf16:
; AVX512DQ: # %bb.0:
160 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
161 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
162 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
163 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
164 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
165 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
166 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
167 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
168 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
169 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
170 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
171 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
172 ; AVX512DQ-NEXT: vzeroupper
173 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf16:
; AVX512BW: # %bb.0:
177 ; AVX512BW-NEXT: kmovw (%rdi), %k0
178 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
179 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
180 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
181 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
182 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
183 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
184 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
185 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
186 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
187 ; AVX512BW-NEXT: vzeroupper
188 ; AVX512BW-NEXT: retq
189 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
190 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
191 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
192 %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
store <32 x i32> %data, ptr %out.vec, align 64
ret void
}
197 define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
198 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32:
199 ; AVX512F-ONLY: # %bb.0:
200 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
201 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2
202 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
203 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
204 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
205 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
206 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
207 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
208 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
209 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
210 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
211 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
212 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
213 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
214 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
215 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
216 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
217 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
218 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
219 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
220 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
221 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
222 ; AVX512F-ONLY-NEXT: vzeroupper
223 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf32:
; AVX512DQ: # %bb.0:
227 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
228 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
229 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
230 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
231 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
232 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
233 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
234 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
235 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
236 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
237 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
238 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
239 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
240 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
241 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
242 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
243 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
244 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
245 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
246 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
247 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
248 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
249 ; AVX512DQ-NEXT: vzeroupper
250 ; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32:
253 ; AVX512BW-ONLY: # %bb.0:
254 ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
255 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
256 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
257 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
258 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
259 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
260 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
261 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
262 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
263 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
264 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
265 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
266 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
267 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
268 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
269 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
270 ; AVX512BW-ONLY-NEXT: vzeroupper
271 ; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32:
274 ; AVX512VBMI-ONLY: # %bb.0:
275 ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
276 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
277 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
278 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
279 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
280 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
281 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
282 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
283 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
284 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
285 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
286 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
287 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
288 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
289 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
290 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
291 ; AVX512VBMI-ONLY-NEXT: vzeroupper
292 ; AVX512VBMI-ONLY-NEXT: retq
293 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
294 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
295 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
296 %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
store <64 x i32> %data, ptr %out.vec, align 64
ret void
}
301 define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
302 ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64:
303 ; AVX512F-ONLY: # %bb.0:
304 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3
305 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5
306 ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4
307 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
308 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
309 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
310 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
311 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
312 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
313 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
314 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
315 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
316 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1
317 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
318 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4
319 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
320 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
321 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k5} {z} = -1
322 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
323 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5
324 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
325 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7
326 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1
327 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
328 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
329 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
330 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
331 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
332 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
333 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z}
334 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
335 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z}
336 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z}
337 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
338 ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
339 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
340 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
341 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
342 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
343 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
344 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
345 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
346 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
347 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
348 ; AVX512F-ONLY-NEXT: vzeroupper
349 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf64:
; AVX512DQ: # %bb.0:
353 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
354 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k5
355 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k3
356 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1
357 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
358 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
359 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
360 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
361 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
362 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
363 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
364 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
365 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
366 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
367 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3
368 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
369 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
370 ; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0
371 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
372 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5
373 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
374 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
375 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
376 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
377 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
378 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
379 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
380 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
381 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
382 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
383 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
384 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
385 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
386 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
387 ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
388 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
389 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
390 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
391 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
392 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
393 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
394 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
395 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
396 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
397 ; AVX512DQ-NEXT: vzeroupper
398 ; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf64:
401 ; AVX512BW-ONLY: # %bb.0:
402 ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
403 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
404 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
405 ; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
406 ; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
407 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
408 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
409 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
410 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
411 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
412 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
413 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
414 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
415 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
416 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
417 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
418 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
419 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
420 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
421 ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
422 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
423 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
424 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
425 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
426 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
427 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
428 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
429 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
430 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
431 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
432 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
433 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
434 ; AVX512BW-ONLY-NEXT: vzeroupper
435 ; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf64:
438 ; AVX512VBMI-ONLY: # %bb.0:
439 ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
440 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
441 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
442 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
443 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
444 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
445 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
446 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
447 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
448 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
449 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
450 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
451 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
452 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
453 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
454 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
455 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
456 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
457 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
458 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
459 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
460 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
461 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
462 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
463 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
464 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
465 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
466 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
467 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
468 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
469 ; AVX512VBMI-ONLY-NEXT: vzeroupper
470 ; AVX512VBMI-ONLY-NEXT: retq
471 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
472 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
473 %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
store <128 x i32> %data, ptr %out.vec, align 64
ret void
}
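; The following group uses replication factor 3: each of the low VF mask bits
; is repeated three times, so the masked loads cover 3*VF i32 elements.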
478 define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
479 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2:
480 ; AVX512F-ONLY: # %bb.0:
481 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
482 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
483 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
484 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
485 ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
486 ; AVX512F-ONLY-NEXT: vpslld $31, %ymm0, %ymm0
487 ; AVX512F-ONLY-NEXT: movb $63, %al
488 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
489 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1}
490 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
491 ; AVX512F-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
492 ; AVX512F-ONLY-NEXT: vmovq %xmm1, 16(%rdx)
493 ; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
494 ; AVX512F-ONLY-NEXT: vzeroupper
495 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf2:
; AVX512DQ: # %bb.0:
499 ; AVX512DQ-NEXT: kmovb (%rdi), %k0
500 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
501 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
502 ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
503 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
504 ; AVX512DQ-NEXT: movb $63, %al
505 ; AVX512DQ-NEXT: kmovw %eax, %k1
506 ; AVX512DQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 {%k1}
507 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
508 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
509 ; AVX512DQ-NEXT: vmovq %xmm1, 16(%rdx)
510 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
511 ; AVX512DQ-NEXT: vzeroupper
512 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf2:
; AVX512BW: # %bb.0:
516 ; AVX512BW-NEXT: kmovw (%rdi), %k1
517 ; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
518 ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
519 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
520 ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0
521 ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
522 ; AVX512BW-NEXT: movb $63, %al
523 ; AVX512BW-NEXT: kmovd %eax, %k1
524 ; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1}
525 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
526 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
527 ; AVX512BW-NEXT: vmovq %xmm1, 16(%rdx)
528 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
529 ; AVX512BW-NEXT: vzeroupper
530 ; AVX512BW-NEXT: retq
531 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
532 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
533 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
534 %data = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr %in.vec, i32 64, <6 x i1> %tgt.mask, <6 x i32> poison)
535 %data.padded = shufflevector <6 x i32> %data, <6 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <6 x i32> %data, ptr %out.vec, align 64
ret void
}
540 define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
541 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4:
542 ; AVX512F-ONLY: # %bb.0:
543 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
544 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
545 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
546 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
547 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
548 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
549 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
550 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
551 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
552 ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
553 ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
554 ; AVX512F-ONLY-NEXT: vzeroupper
555 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf4:
; AVX512DQ: # %bb.0:
559 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
560 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
561 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
562 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
563 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
564 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
565 ; AVX512DQ-NEXT: kmovw %eax, %k1
566 ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
567 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
568 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
569 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
570 ; AVX512DQ-NEXT: vzeroupper
571 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf4:
; AVX512BW: # %bb.0:
575 ; AVX512BW-NEXT: kmovw (%rdi), %k1
576 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
577 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
578 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
579 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
580 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF
581 ; AVX512BW-NEXT: kmovd %eax, %k1
582 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
583 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
584 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
585 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
586 ; AVX512BW-NEXT: vzeroupper
587 ; AVX512BW-NEXT: retq
588 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
589 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
590 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
591 %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
592 %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
store <12 x i32> %data, ptr %out.vec, align 64
ret void
}
597 define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
598 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8:
599 ; AVX512F-ONLY: # %bb.0:
600 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
601 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
602 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
603 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
604 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
605 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
606 ; AVX512F-ONLY-NEXT: movw $1, %ax
607 ; AVX512F-ONLY-NEXT: kmovw %eax, %k2
608 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
609 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
610 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
611 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
612 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
613 ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
614 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
615 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
616 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
617 ; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx)
618 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
619 ; AVX512F-ONLY-NEXT: vzeroupper
620 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf8:
; AVX512DQ: # %bb.0:
624 ; AVX512DQ-NEXT: kmovb (%rdi), %k0
625 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
626 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
627 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
628 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
629 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
630 ; AVX512DQ-NEXT: movw $1, %ax
631 ; AVX512DQ-NEXT: kmovw %eax, %k1
632 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
633 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
634 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
635 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
636 ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
637 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2
638 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
639 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
640 ; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rdx)
641 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
642 ; AVX512DQ-NEXT: vzeroupper
643 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf8:
; AVX512BW: # %bb.0:
647 ; AVX512BW-NEXT: kmovw (%rdi), %k0
648 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
649 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,0,0,0,0,0,0,0,0]
650 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
651 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
652 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF
653 ; AVX512BW-NEXT: kmovd %eax, %k1
654 ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
655 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
656 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
657 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
658 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
659 ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx)
660 ; AVX512BW-NEXT: vzeroupper
661 ; AVX512BW-NEXT: retq
662 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
663 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
664 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
665 %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
666 %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <24 x i32> %data, ptr %out.vec, align 64
ret void
}
671 define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
672 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16:
673 ; AVX512F-ONLY: # %bb.0:
674 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
675 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
676 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
677 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
678 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
679 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
680 ; AVX512F-ONLY-NEXT: movw $1, %ax
681 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
682 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
683 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
684 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
685 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
686 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
687 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
688 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
689 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
690 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
691 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
692 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
693 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
694 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx)
695 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
696 ; AVX512F-ONLY-NEXT: vzeroupper
697 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf16:
; AVX512DQ: # %bb.0:
701 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
702 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
703 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
704 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
705 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
706 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
707 ; AVX512DQ-NEXT: movw $1, %ax
708 ; AVX512DQ-NEXT: kmovw %eax, %k1
709 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
710 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
711 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
712 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
713 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
714 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
715 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
716 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
717 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
718 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
719 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
720 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
721 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
722 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
723 ; AVX512DQ-NEXT: vzeroupper
724 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf16:
; AVX512BW: # %bb.0:
728 ; AVX512BW-NEXT: kmovw (%rdi), %k1
729 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
730 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
731 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
732 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
733 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
734 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
735 ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
736 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
737 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
738 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
739 ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0
740 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
741 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
742 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
743 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
744 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
745 ; AVX512BW-NEXT: vzeroupper
746 ; AVX512BW-NEXT: retq
747 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
748 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
749 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
750 %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
store <48 x i32> %data, ptr %out.vec, align 64
ret void
}
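; For factor 3 at VF = 32 the AVX512BW lowering below assembles the replicated
; mask directly in k-registers, using long kshift/kand/kor chains and spilling
; 16-bit mask values to the stack between uses.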
755 define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
756 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32:
757 ; AVX512F-ONLY: # %bb.0:
758 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2
759 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
760 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
761 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
762 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
763 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
764 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
765 ; AVX512F-ONLY-NEXT: movw $1, %ax
766 ; AVX512F-ONLY-NEXT: kmovw %eax, %k2
767 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2}
768 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3
769 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
770 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3
771 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2
772 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
773 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0
774 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
775 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
776 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
777 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
778 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1
779 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
780 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0
781 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
782 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z}
783 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
784 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
785 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
786 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k4} {z}
787 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k2} {z}
788 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
789 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx)
790 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
791 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
792 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
793 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
794 ; AVX512F-ONLY-NEXT: vzeroupper
795 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf32:
; AVX512DQ: # %bb.0:
799 ; AVX512DQ-NEXT: kmovw (%rdi), %k1
800 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
801 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
802 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
803 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
804 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
805 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
806 ; AVX512DQ-NEXT: movw $1, %ax
807 ; AVX512DQ-NEXT: kmovw %eax, %k1
808 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
809 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2
810 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
811 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3
812 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
813 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
814 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0
815 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
816 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
817 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
818 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
819 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1
820 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
821 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0
822 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
823 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
824 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
825 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
826 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
827 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
828 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
829 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
830 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx)
831 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
832 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
833 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
834 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
835 ; AVX512DQ-NEXT: vzeroupper
836 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf32:
; AVX512BW: # %bb.0:
840 ; AVX512BW-NEXT: kmovd (%rdi), %k0
841 ; AVX512BW-NEXT: kshiftrd $1, %k0, %k1
842 ; AVX512BW-NEXT: movw $-3, %ax
843 ; AVX512BW-NEXT: kmovd %eax, %k4
844 ; AVX512BW-NEXT: kmovw (%rdi), %k2
845 ; AVX512BW-NEXT: kandw %k4, %k2, %k3
846 ; AVX512BW-NEXT: kmovq %k4, %k7
847 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
848 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
849 ; AVX512BW-NEXT: korw %k4, %k3, %k3
850 ; AVX512BW-NEXT: movw $-5, %ax
851 ; AVX512BW-NEXT: kmovd %eax, %k4
852 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
853 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
854 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
855 ; AVX512BW-NEXT: korw %k2, %k3, %k2
856 ; AVX512BW-NEXT: movw $-9, %ax
857 ; AVX512BW-NEXT: kmovd %eax, %k3
858 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
859 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
860 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
861 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
862 ; AVX512BW-NEXT: korw %k3, %k2, %k2
863 ; AVX512BW-NEXT: movw $-17, %ax
864 ; AVX512BW-NEXT: kmovd %eax, %k5
865 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
866 ; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
867 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
868 ; AVX512BW-NEXT: korw %k3, %k2, %k2
869 ; AVX512BW-NEXT: movw $-33, %ax
870 ; AVX512BW-NEXT: kmovd %eax, %k3
871 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
872 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
873 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
874 ; AVX512BW-NEXT: korw %k1, %k2, %k1
875 ; AVX512BW-NEXT: movw $-65, %ax
876 ; AVX512BW-NEXT: kmovd %eax, %k2
877 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
878 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
879 ; AVX512BW-NEXT: kshiftrd $2, %k0, %k2
880 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
881 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k3
882 ; AVX512BW-NEXT: korw %k3, %k1, %k1
883 ; AVX512BW-NEXT: movw $-129, %ax
884 ; AVX512BW-NEXT: kmovd %eax, %k3
885 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
886 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
887 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k3
888 ; AVX512BW-NEXT: korw %k3, %k1, %k1
889 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
890 ; AVX512BW-NEXT: kmovd %eax, %k3
891 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
892 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
893 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
894 ; AVX512BW-NEXT: korw %k2, %k1, %k1
895 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
896 ; AVX512BW-NEXT: kmovd %eax, %k2
897 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
898 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
899 ; AVX512BW-NEXT: kshiftrd $3, %k0, %k2
900 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
901 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k3
902 ; AVX512BW-NEXT: korw %k3, %k1, %k1
903 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
904 ; AVX512BW-NEXT: kmovd %eax, %k3
905 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
906 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
907 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k3
908 ; AVX512BW-NEXT: korw %k3, %k1, %k1
909 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
910 ; AVX512BW-NEXT: kmovd %eax, %k3
911 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
912 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
913 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
914 ; AVX512BW-NEXT: korw %k2, %k1, %k1
915 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
916 ; AVX512BW-NEXT: kmovd %eax, %k2
917 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
918 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
919 ; AVX512BW-NEXT: kshiftrd $4, %k0, %k4
920 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k2
921 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k3
922 ; AVX512BW-NEXT: korw %k3, %k1, %k1
923 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
924 ; AVX512BW-NEXT: kmovd %eax, %k6
925 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
926 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
927 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
928 ; AVX512BW-NEXT: korw %k2, %k1, %k1
929 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
930 ; AVX512BW-NEXT: kmovd %eax, %k2
931 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
932 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
933 ; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
934 ; AVX512BW-NEXT: korw %k4, %k1, %k1
935 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
936 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
937 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k2
938 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
939 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
940 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
941 ; AVX512BW-NEXT: korw %k2, %k1, %k1
942 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
943 ; AVX512BW-NEXT: kshiftrd $27, %k0, %k1
944 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k4
945 ; AVX512BW-NEXT: kshiftrd $26, %k0, %k1
946 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
947 ; AVX512BW-NEXT: kmovq %k7, %k2
948 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
949 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
950 ; AVX512BW-NEXT: kshiftrw $14, %k4, %k7
951 ; AVX512BW-NEXT: korw %k7, %k1, %k1
952 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
953 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
954 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k7
955 ; AVX512BW-NEXT: korw %k7, %k1, %k1
956 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
957 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
958 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k4
959 ; AVX512BW-NEXT: korw %k4, %k1, %k1
960 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
961 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k4
962 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
963 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k7
964 ; AVX512BW-NEXT: korw %k7, %k1, %k1
965 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
966 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
967 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k7
968 ; AVX512BW-NEXT: korw %k7, %k1, %k1
969 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
970 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
971 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
972 ; AVX512BW-NEXT: korw %k4, %k1, %k1
973 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
974 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
975 ; AVX512BW-NEXT: kshiftrd $29, %k0, %k4
976 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
977 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k7
978 ; AVX512BW-NEXT: korw %k7, %k1, %k1
979 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
980 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
981 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k7
982 ; AVX512BW-NEXT: korw %k7, %k1, %k1
983 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
984 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
985 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
986 ; AVX512BW-NEXT: korw %k4, %k1, %k1
987 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
988 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
989 ; AVX512BW-NEXT: kshiftrd $30, %k0, %k4
990 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
991 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k7
992 ; AVX512BW-NEXT: korw %k7, %k1, %k1
993 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
994 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
995 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
996 ; AVX512BW-NEXT: korw %k7, %k1, %k1
997 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
998 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
999 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
1000 ; AVX512BW-NEXT: korw %k4, %k1, %k1
1001 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
1002 ; AVX512BW-NEXT: kshiftrd $31, %k0, %k4
1003 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k7
1004 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
1005 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1006 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1007 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1008 ; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
1009 ; AVX512BW-NEXT: korw %k4, %k1, %k1
1010 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1011 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1012 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1013 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
1014 ; AVX512BW-NEXT: kshiftrd $21, %k0, %k1
1015 ; AVX512BW-NEXT: kandw %k2, %k1, %k6
1016 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1017 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1018 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
1019 ; AVX512BW-NEXT: korw %k1, %k6, %k1
1020 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1021 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1022 ; AVX512BW-NEXT: kshiftrd $22, %k0, %k6
1023 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1024 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
1025 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1026 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1027 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1028 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
1029 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1030 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1031 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1032 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
1033 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1034 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1035 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1036 ; AVX512BW-NEXT: kshiftrd $23, %k0, %k6
1037 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1038 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
1039 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1040 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1041 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
1042 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1043 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1044 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1045 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
1046 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1047 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1048 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1049 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
1050 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1051 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
1052 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1053 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1054 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1055 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
1056 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1057 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1058 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1059 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
1060 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1061 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1062 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1063 ; AVX512BW-NEXT: kshiftrd $25, %k0, %k6
1064 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1065 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
1066 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1067 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1068 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1069 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
1070 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1071 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1072 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1073 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
1074 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1075 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1076 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1077 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
1078 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k6
1079 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1080 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1081 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1082 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1083 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1084 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
1085 ; AVX512BW-NEXT: kshiftrd $16, %k0, %k1
1086 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1087 ; AVX512BW-NEXT: kandw %k2, %k1, %k2
1088 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1089 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
1090 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1091 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1092 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1093 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
1094 ; AVX512BW-NEXT: korw %k1, %k2, %k1
1095 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1096 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2
1097 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1098 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
1099 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1100 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1101 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1102 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
1103 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1104 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1105 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
1106 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1107 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1108 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1109 ; AVX512BW-NEXT: kshiftrd $18, %k0, %k2
1110 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1111 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
1112 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1113 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1114 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
1115 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1116 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1117 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1118 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
1119 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1120 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1121 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1122 ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
1123 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1124 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
1125 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1126 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1127 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1128 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
1129 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1130 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1131 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1132 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
1133 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1134 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1135 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1136 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k2
1137 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
1138 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
1139 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1140 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1141 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1142 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
1143 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1144 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1145 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1146 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
1147 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1148 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1149 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1150 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1151 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1152 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
1153 ; AVX512BW-NEXT: kshiftrd $11, %k0, %k1
1154 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
1155 ; AVX512BW-NEXT: kshiftrd $10, %k0, %k4
1156 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1157 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1158 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
1159 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
1160 ; AVX512BW-NEXT: korw %k6, %k4, %k4
1161 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1162 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
1163 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
1164 ; AVX512BW-NEXT: korw %k6, %k4, %k4
1165 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1166 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
1167 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
1168 ; AVX512BW-NEXT: korw %k2, %k4, %k2
1169 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1170 ; AVX512BW-NEXT: kshiftrd $12, %k0, %k4
1171 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
1172 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k6
1173 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1174 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1175 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1176 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
1177 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1178 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1179 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1180 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
1181 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1182 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1183 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1184 ; AVX512BW-NEXT: kshiftrd $13, %k0, %k4
1185 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
1186 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k6
1187 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1188 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
1189 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
1190 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1191 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1192 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1193 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
1194 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1195 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1196 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1197 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k4
1198 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
1199 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k6
1200 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1201 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1202 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1203 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k6
1204 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1205 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1206 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
1207 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
1208 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1209 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
1210 ; AVX512BW-NEXT: kshiftrd $15, %k0, %k4
1211 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k6
1212 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
1213 ; AVX512BW-NEXT: korw %k7, %k2, %k2
1214 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1215 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
1216 ; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
1217 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1218 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
1219 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
1220 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1221 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z}
1222 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
1223 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1224 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
1225 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1226 ; AVX512BW-NEXT: kshiftrw $14, %k4, %k4
1227 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1228 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1229 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
1230 ; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
1231 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
1232 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
1233 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1234 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1235 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1236 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k6
1237 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1238 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1239 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1240 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k4
1241 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1242 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1243 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
1244 ; AVX512BW-NEXT: kshiftrd $7, %k0, %k4
1245 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
1246 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
1247 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1248 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1249 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1250 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k6
1251 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1252 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1253 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1254 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k4
1255 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1256 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1257 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
1258 ; AVX512BW-NEXT: kshiftrd $8, %k0, %k4
1259 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
1260 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
1261 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1262 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1263 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k6
1264 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1265 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1266 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1267 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k4
1268 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1269 ; AVX512BW-NEXT: kshiftrd $9, %k0, %k0
1270 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1271 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
1272 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k4
1273 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1274 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
1275 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
1276 ; AVX512BW-NEXT: korw %k4, %k2, %k2
1277 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1278 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1279 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
1280 ; AVX512BW-NEXT: korw %k0, %k2, %k0
1281 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
1282 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
1283 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
1284 ; AVX512BW-NEXT: korw %k2, %k0, %k0
1285 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
1286 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
1287 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1288 ; AVX512BW-NEXT: korw %k1, %k0, %k1
1289 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
1290 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
1291 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx)
1292 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
1293 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
1294 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rdx)
1295 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
1296 ; AVX512BW-NEXT: vzeroupper
1297 ; AVX512BW-NEXT: retq
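; The AVX512BW sequence above essentially builds each 16-bit write-mask bit by bit: kshiftrd pulls one
; bit of the 32-bit source mask down, kshiftlw/kshiftrw move it into position, korw merges it in, and
; spilled/reloaded kandw constants clear the destination bit first before each merge.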
1298 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
1299 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1300 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
1301 %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
1302 store <96 x i32> %data, ptr %out.vec, align 64
1303 ret void
1304 }
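; mask_replication_factor3_vf64 replicates each of the 64 source mask bits three times; the resulting
; 192 mask bits gate twelve zero-masked 512-bit loads from (%rsi), stored contiguously at (%rdx)
; through 704(%rdx).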
1306 define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
1307 ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64:
1308 ; AVX512F-ONLY: # %bb.0:
1309 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
1310 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
1311 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
1312 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
1313 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
1314 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
1315 ; AVX512F-ONLY-NEXT: movw $1, %ax
1316 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
1317 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
1318 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
1319 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
1320 ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
1321 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
1322 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
1323 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
1324 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
1325 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
1326 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6
1327 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
1328 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0
1329 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8
1330 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9
1331 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm3
1332 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm10
1333 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm11
1334 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm4
1335 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1
1336 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm2
1337 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm5
1338 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z}
1339 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
1340 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
1341 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
1342 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
1343 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
1344 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
1345 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
1346 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
1347 ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
1348 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
1349 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
1350 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
1351 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
1352 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
1353 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
1354 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
1355 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
1356 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
1357 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
1358 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
1359 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
1360 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
1361 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx)
1362 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 128(%rdx)
1363 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
1364 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx)
1365 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
1366 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 384(%rdx)
1367 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 448(%rdx)
1368 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx)
1369 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 576(%rdx)
1370 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 640(%rdx)
1371 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 704(%rdx)
1372 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, (%rdx)
1373 ; AVX512F-ONLY-NEXT: vzeroupper
1374 ; AVX512F-ONLY-NEXT: retq
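; The AVX512F-only lowering above expands each 16-bit mask chunk into a vector of all-ones elements
; under {%k1} {z}, replicates the elements with vpermd using three index patterns, merges element 0 of
; the unreplicated vector into the first chunk under the $1 write-mask, and converts each replicated
; vector back to a k-register with vptestmd.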
1375 ;
1376 ; AVX512DQ-LABEL: mask_replication_factor3_vf64:
1377 ; AVX512DQ: # %bb.0:
1378 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
1379 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
1380 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
1381 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
1382 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
1383 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
1384 ; AVX512DQ-NEXT: movw $1, %ax
1385 ; AVX512DQ-NEXT: kmovw %eax, %k1
1386 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
1387 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
1388 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
1389 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
1390 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
1391 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
1392 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
1393 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
1394 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
1395 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6
1396 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
1397 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0
1398 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8
1399 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9
1400 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm3
1401 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm10
1402 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm11
1403 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm4
1404 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1
1405 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm2
1406 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm5
1407 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z}
1408 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
1409 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
1410 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
1411 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
1412 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
1413 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
1414 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
1415 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
1416 ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
1417 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
1418 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
1419 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
1420 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
1421 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
1422 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
1423 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
1424 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
1425 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
1426 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
1427 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
1428 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
1429 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
1430 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx)
1431 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rdx)
1432 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
1433 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx)
1434 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx)
1435 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rdx)
1436 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rdx)
1437 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx)
1438 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rdx)
1439 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%rdx)
1440 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 704(%rdx)
1441 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx)
1442 ; AVX512DQ-NEXT: vzeroupper
1443 ; AVX512DQ-NEXT: retq
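; The AVX512DQ variant above uses the same vpermd replication but moves masks between k-registers and
; vectors with vpmovm2d/vpmovd2m instead of vpternlogd/vptestmd.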
1444 ;
1445 ; AVX512BW-LABEL: mask_replication_factor3_vf64:
1446 ; AVX512BW: # %bb.0:
1447 ; AVX512BW-NEXT: kmovq (%rdi), %k0
1448 ; AVX512BW-NEXT: kshiftrq $1, %k0, %k1
1449 ; AVX512BW-NEXT: movw $-3, %ax
1450 ; AVX512BW-NEXT: kmovd %eax, %k4
1451 ; AVX512BW-NEXT: kmovw (%rdi), %k2
1452 ; AVX512BW-NEXT: kandw %k4, %k2, %k3
1453 ; AVX512BW-NEXT: kmovq %k4, %k7
1454 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1455 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
1456 ; AVX512BW-NEXT: korw %k4, %k3, %k3
1457 ; AVX512BW-NEXT: movw $-5, %ax
1458 ; AVX512BW-NEXT: kmovd %eax, %k4
1459 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1460 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
1461 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
1462 ; AVX512BW-NEXT: korw %k2, %k3, %k2
1463 ; AVX512BW-NEXT: movw $-9, %ax
1464 ; AVX512BW-NEXT: kmovd %eax, %k3
1465 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1466 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1467 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1468 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
1469 ; AVX512BW-NEXT: korw %k3, %k2, %k2
1470 ; AVX512BW-NEXT: movw $-17, %ax
1471 ; AVX512BW-NEXT: kmovd %eax, %k3
1472 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1473 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1474 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
1475 ; AVX512BW-NEXT: korw %k3, %k2, %k2
1476 ; AVX512BW-NEXT: movw $-33, %ax
1477 ; AVX512BW-NEXT: kmovd %eax, %k3
1478 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1479 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1480 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
1481 ; AVX512BW-NEXT: korw %k1, %k2, %k1
1482 ; AVX512BW-NEXT: movw $-65, %ax
1483 ; AVX512BW-NEXT: kmovd %eax, %k2
1484 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1485 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1486 ; AVX512BW-NEXT: kshiftrq $2, %k0, %k2
1487 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1488 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k3
1489 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1490 ; AVX512BW-NEXT: movw $-129, %ax
1491 ; AVX512BW-NEXT: kmovd %eax, %k3
1492 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1493 ; AVX512BW-NEXT: kmovq %k3, %k5
1494 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1495 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k3
1496 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1497 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
1498 ; AVX512BW-NEXT: kmovd %eax, %k3
1499 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1500 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1501 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
1502 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1503 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
1504 ; AVX512BW-NEXT: kmovd %eax, %k2
1505 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1506 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1507 ; AVX512BW-NEXT: kshiftrq $3, %k0, %k2
1508 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1509 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k3
1510 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1511 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
1512 ; AVX512BW-NEXT: kmovd %eax, %k3
1513 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1514 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1515 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k3
1516 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1517 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
1518 ; AVX512BW-NEXT: kmovd %eax, %k3
1519 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1520 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1521 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
1522 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1523 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
1524 ; AVX512BW-NEXT: kmovd %eax, %k2
1525 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1526 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1527 ; AVX512BW-NEXT: kshiftrq $4, %k0, %k2
1528 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k3
1529 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k4
1530 ; AVX512BW-NEXT: korw %k4, %k1, %k1
1531 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
1532 ; AVX512BW-NEXT: kmovd %eax, %k6
1533 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
1534 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1535 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k3
1536 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1537 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
1538 ; AVX512BW-NEXT: kmovd %eax, %k3
1539 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1540 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1541 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
1542 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1543 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1544 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1545 ; AVX512BW-NEXT: kshiftrq $5, %k0, %k2
1546 ; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1547 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1548 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1549 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1550 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
1551 ; AVX512BW-NEXT: kshiftrq $59, %k0, %k1
1552 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
1553 ; AVX512BW-NEXT: kshiftrq $58, %k0, %k1
1554 ; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1555 ; AVX512BW-NEXT: kmovq %k7, %k3
1556 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1557 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1558 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k7
1559 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1560 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1561 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1562 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k7
1563 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1564 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1565 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1566 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
1567 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1568 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1569 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1570 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k2
1571 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1572 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k7
1573 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1574 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1575 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1576 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k7
1577 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1578 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1579 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1580 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k2
1581 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1582 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1583 ; AVX512BW-NEXT: kshiftrq $61, %k0, %k2
1584 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1585 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k7
1586 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1587 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1588 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1589 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k7
1590 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1591 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1592 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1593 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
1594 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1595 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1596 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1597 ; AVX512BW-NEXT: kshiftrq $62, %k0, %k2
1598 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1599 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k7
1600 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1601 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1602 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1603 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k7
1604 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1605 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1606 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1607 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
1608 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1609 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
1610 ; AVX512BW-NEXT: kshiftrq $63, %k0, %k2
1611 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7
1612 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
1613 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1614 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1615 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
1616 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
1617 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1618 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1619 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1620 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1621 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
1622 ; AVX512BW-NEXT: kshiftrq $53, %k0, %k1
1623 ; AVX512BW-NEXT: kandw %k3, %k1, %k6
1624 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1625 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1626 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
1627 ; AVX512BW-NEXT: korw %k1, %k6, %k1
1628 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1629 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1630 ; AVX512BW-NEXT: kshiftrq $54, %k0, %k6
1631 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1632 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
1633 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1634 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1635 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1636 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
1637 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1638 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1639 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
1640 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1641 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1642 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1643 ; AVX512BW-NEXT: kshiftrq $55, %k0, %k6
1644 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1645 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
1646 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1647 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1648 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1649 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
1650 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1651 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1652 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1653 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
1654 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1655 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1656 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k6
1657 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1658 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
1659 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1660 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1661 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1662 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
1663 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1664 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1665 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1666 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
1667 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1668 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1669 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1670 ; AVX512BW-NEXT: kshiftrq $57, %k0, %k6
1671 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1672 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
1673 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1674 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1675 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1676 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
1677 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1678 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1679 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1680 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
1681 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1682 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1683 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1684 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload
1685 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k6
1686 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1687 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1688 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1689 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1690 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1691 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
1692 ; AVX512BW-NEXT: kshiftrq $48, %k0, %k1
1693 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1694 ; AVX512BW-NEXT: kandw %k3, %k1, %k3
1695 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1696 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
1697 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1698 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1699 ; AVX512BW-NEXT: kandw %k6, %k3, %k3
1700 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
1701 ; AVX512BW-NEXT: korw %k1, %k3, %k1
1702 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1703 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1704 ; AVX512BW-NEXT: kshiftrq $49, %k0, %k3
1705 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1706 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
1707 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1708 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1709 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
1710 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
1711 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1712 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1713 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
1714 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
1715 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1716 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1717 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
1718 ; AVX512BW-NEXT: kshiftrq $50, %k0, %k3
1719 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1720 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
1721 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1722 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1723 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
1724 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1725 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1726 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1727 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
1728 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1729 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1730 ; AVX512BW-NEXT: kshiftrq $51, %k0, %k3
1731 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1732 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
1733 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1734 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1735 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1736 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
1737 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1738 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1739 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1740 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
1741 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1742 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1743 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1744 ; AVX512BW-NEXT: kshiftrq $52, %k0, %k3
1745 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
1746 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
1747 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1748 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1749 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1750 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
1751 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1752 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1753 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
1754 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1755 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1756 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1757 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1758 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1759 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
1760 ; AVX512BW-NEXT: kshiftrq $43, %k0, %k1
1761 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
1762 ; AVX512BW-NEXT: kshiftrq $42, %k0, %k1
1763 ; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1764 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1765 ; AVX512BW-NEXT: kandw %k4, %k1, %k3
1766 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
1767 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1768 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1769 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1770 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
1771 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1772 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1773 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1774 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
1775 ; AVX512BW-NEXT: korw %k2, %k3, %k2
1776 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1777 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1778 ; AVX512BW-NEXT: kshiftrq $44, %k0, %k3
1779 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1780 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
1781 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1782 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1783 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1784 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
1785 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1786 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1787 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1788 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
1789 ; AVX512BW-NEXT: korw %k3, %k2, %k2
1790 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1791 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
1792 ; AVX512BW-NEXT: kshiftrq $45, %k0, %k3
1793 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1794 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
1795 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1796 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1797 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1798 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
1799 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1800 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1801 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1802 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
1803 ; AVX512BW-NEXT: korw %k3, %k2, %k2
1804 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
1805 ; AVX512BW-NEXT: kshiftrq $46, %k0, %k3
1806 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1807 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
1808 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1809 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1810 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
1811 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
1812 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1813 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1814 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
1815 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
1816 ; AVX512BW-NEXT: korw %k3, %k2, %k2
1817 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
1818 ; AVX512BW-NEXT: kshiftrq $47, %k0, %k3
1819 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
1820 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
1821 ; AVX512BW-NEXT: korw %k7, %k2, %k2
1822 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1823 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
1824 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
1825 ; AVX512BW-NEXT: korw %k3, %k2, %k2
1826 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
1827 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
1828 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1829 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z}
1830 ; AVX512BW-NEXT: kshiftrq $37, %k0, %k2
1831 ; AVX512BW-NEXT: kandw %k4, %k2, %k3
1832 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
1833 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
1834 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
1835 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1836 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1837 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
1838 ; AVX512BW-NEXT: kshiftrq $38, %k0, %k6
1839 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1840 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
1841 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1842 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1843 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
1844 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
1845 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1846 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1847 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
1848 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
1849 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1850 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1851 ; AVX512BW-NEXT: kshiftrq $39, %k0, %k6
1852 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1853 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
1854 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1855 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1856 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
1857 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
1858 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1859 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1860 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1861 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
1862 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1863 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1864 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1865 ; AVX512BW-NEXT: kshiftrq $40, %k0, %k6
1866 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1867 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
1868 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1869 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1870 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1871 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
1872 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1873 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1874 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1875 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
1876 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1877 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
1878 ; AVX512BW-NEXT: kshiftrq $41, %k0, %k6
1879 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
1880 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
1881 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1882 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1883 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
1884 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
1885 ; AVX512BW-NEXT: korw %k7, %k3, %k3
1886 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1887 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1888 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
1889 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1890 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1891 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1892 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
1893 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
1894 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1895 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
1896 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
1897 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1898 ; AVX512BW-NEXT: korw %k1, %k3, %k1
1899 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
1900 ; AVX512BW-NEXT: kshiftrq $32, %k0, %k1
1901 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
1902 ; AVX512BW-NEXT: kandw %k3, %k1, %k3
1903 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
1904 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
1905 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1906 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1907 ; AVX512BW-NEXT: kandw %k6, %k3, %k3
1908 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
1909 ; AVX512BW-NEXT: korw %k1, %k3, %k1
1910 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1911 ; AVX512BW-NEXT: kshiftrq $33, %k0, %k3
1912 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1913 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
1914 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1915 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1916 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
1917 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
1918 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1919 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
1920 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
1921 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
1922 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1923 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1924 ; AVX512BW-NEXT: kshiftrq $34, %k0, %k3
1925 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1926 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
1927 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1928 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1929 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1930 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
1931 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1932 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1933 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1934 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
1935 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1936 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1937 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1938 ; AVX512BW-NEXT: kshiftrq $35, %k0, %k3
1939 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1940 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
1941 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1942 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1943 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1944 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
1945 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1946 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1947 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
1948 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
1949 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1950 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1951 ; AVX512BW-NEXT: kshiftrq $36, %k0, %k3
1952 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
1953 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
1954 ; AVX512BW-NEXT: korw %k7, %k1, %k1
1955 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
1956 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
1957 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
1958 ; AVX512BW-NEXT: korw %k6, %k1, %k1
1959 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
1960 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
1961 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
1962 ; AVX512BW-NEXT: korw %k3, %k1, %k1
1963 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
1964 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
1965 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
1966 ; AVX512BW-NEXT: korw %k2, %k1, %k1
1967 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
1968 ; AVX512BW-NEXT: kshiftrq $27, %k0, %k1
1969 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
1970 ; AVX512BW-NEXT: kshiftrq $26, %k0, %k3
1971 ; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1972 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1973 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1974 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
1975 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1976 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1977 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1978 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
1979 ; AVX512BW-NEXT: korw %k6, %k3, %k3
1980 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1981 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
1982 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
1983 ; AVX512BW-NEXT: korw %k2, %k3, %k2
1984 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
1985 ; AVX512BW-NEXT: kshiftrq $28, %k0, %k3
1986 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
1987 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
1988 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1989 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
1990 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
1991 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
1992 ; AVX512BW-NEXT: korw %k6, %k2, %k2
1993 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1994 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1995 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
1996 ; AVX512BW-NEXT: korw %k3, %k2, %k2
1997 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
1998 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
1999 ; AVX512BW-NEXT: kshiftrq $29, %k0, %k3
2000 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2001 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
2002 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2003 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2004 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2005 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
2006 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2007 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2008 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2009 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
2010 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2011 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2012 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2013 ; AVX512BW-NEXT: kshiftrq $30, %k0, %k3
2014 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2015 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
2016 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2017 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2018 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
2019 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
2020 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2021 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2022 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
2023 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
2024 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2025 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
2026 ; AVX512BW-NEXT: kshiftrq $31, %k0, %k3
2027 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
2028 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
2029 ; AVX512BW-NEXT: korw %k7, %k2, %k2
2030 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
2031 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
2032 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2033 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
2034 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
2035 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2036 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
2037 ; AVX512BW-NEXT: kshiftrq $21, %k0, %k2
2038 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
2039 ; AVX512BW-NEXT: kandw %k5, %k2, %k3
2040 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
2041 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2042 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
2043 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2044 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2045 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
2046 ; AVX512BW-NEXT: kshiftrq $22, %k0, %k6
2047 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
2048 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
2049 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2050 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2051 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
2052 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
2053 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2054 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2055 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
2056 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
2057 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2058 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
2059 ; AVX512BW-NEXT: kshiftrq $23, %k0, %k6
2060 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
2061 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
2062 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2063 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
2064 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
2065 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
2066 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2067 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2068 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
2069 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
2070 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2071 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2072 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
2073 ; AVX512BW-NEXT: kshiftrq $24, %k0, %k6
2074 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
2075 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
2076 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2077 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2078 ; AVX512BW-NEXT: kandw %k7, %k3, %k3
2079 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
2080 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2081 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2082 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
2083 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2084 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2085 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2086 ; AVX512BW-NEXT: kshiftrq $25, %k0, %k6
2087 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
2088 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
2089 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2090 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2091 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2092 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
2093 ; AVX512BW-NEXT: korw %k7, %k3, %k3
2094 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2095 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2096 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
2097 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2098 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2099 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2100 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
2101 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
2102 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2103 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
2104 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
2105 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
2106 ; AVX512BW-NEXT: korw %k1, %k3, %k1
2107 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
2108 ; AVX512BW-NEXT: kshiftrq $16, %k0, %k1
2109 ; AVX512BW-NEXT: kandw %k5, %k1, %k3
2110 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
2111 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
2112 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2113 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
2114 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
2115 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
2116 ; AVX512BW-NEXT: korw %k1, %k3, %k1
2117 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2118 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
2119 ; AVX512BW-NEXT: kshiftrq $17, %k0, %k3
2120 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2121 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
2122 ; AVX512BW-NEXT: korw %k6, %k1, %k1
2123 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
2124 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
2125 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
2126 ; AVX512BW-NEXT: korw %k6, %k1, %k1
2127 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
2128 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
2129 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
2130 ; AVX512BW-NEXT: korw %k3, %k1, %k1
2131 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
2132 ; AVX512BW-NEXT: kshiftrq $18, %k0, %k3
2133 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2134 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
2135 ; AVX512BW-NEXT: korw %k6, %k1, %k1
2136 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
2137 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
2138 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
2139 ; AVX512BW-NEXT: korw %k6, %k1, %k1
2140 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
2141 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
2142 ; AVX512BW-NEXT: korw %k3, %k1, %k1
2143 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
2144 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
2145 ; AVX512BW-NEXT: kshiftrq $19, %k0, %k3
2146 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2147 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
2148 ; AVX512BW-NEXT: korw %k6, %k1, %k1
2149 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2150 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
2151 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
2152 ; AVX512BW-NEXT: korw %k6, %k1, %k1
2153 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2154 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
2155 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
2156 ; AVX512BW-NEXT: korw %k3, %k1, %k1
2157 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2158 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
2159 ; AVX512BW-NEXT: kshiftrq $20, %k0, %k3
2160 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
2161 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
2162 ; AVX512BW-NEXT: korw %k7, %k1, %k1
2163 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2164 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
2165 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
2166 ; AVX512BW-NEXT: korw %k6, %k1, %k1
2167 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2168 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
2169 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
2170 ; AVX512BW-NEXT: korw %k3, %k1, %k1
2171 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
2172 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
2173 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
2174 ; AVX512BW-NEXT: korw %k2, %k1, %k1
2175 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
2176 ; AVX512BW-NEXT: kshiftrq $11, %k0, %k1
2177 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
2178 ; AVX512BW-NEXT: kshiftrq $10, %k0, %k3
2179 ; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2180 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2181 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2182 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
2183 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2184 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2185 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2186 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
2187 ; AVX512BW-NEXT: korw %k6, %k3, %k3
2188 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2189 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
2190 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
2191 ; AVX512BW-NEXT: korw %k2, %k3, %k2
2192 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2193 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2194 ; AVX512BW-NEXT: kshiftrq $12, %k0, %k3
2195 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2196 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
2197 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2198 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2199 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2200 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
2201 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2202 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2203 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2204 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
2205 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2206 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
2207 ; AVX512BW-NEXT: kshiftrq $13, %k0, %k3
2208 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2209 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
2210 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2211 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2212 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2213 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
2214 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2215 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
2216 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
2217 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2218 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
2219 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
2220 ; AVX512BW-NEXT: kshiftrq $14, %k0, %k3
2221 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2222 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
2223 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2224 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2225 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2226 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
2227 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2228 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
2229 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
2230 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
2231 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2232 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
2233 ; AVX512BW-NEXT: kshiftrq $15, %k0, %k3
2234 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
2235 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
2236 ; AVX512BW-NEXT: korw %k7, %k2, %k2
2237 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
2238 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
2239 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
2240 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2241 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
2242 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
2243 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2244 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z}
2245 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
2246 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2247 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
2248 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2249 ; AVX512BW-NEXT: kshiftrw $14, %k3, %k3
2250 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2251 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2252 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
2253 ; AVX512BW-NEXT: kshiftrq $6, %k0, %k3
2254 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2255 ; AVX512BW-NEXT: kshiftrw $13, %k3, %k6
2256 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2257 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2258 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
2259 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
2260 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2261 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2262 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
2263 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k3
2264 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2265 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2266 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
2267 ; AVX512BW-NEXT: kshiftrq $7, %k0, %k3
2268 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2269 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
2270 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2271 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2272 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
2273 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
2274 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2275 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2276 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
2277 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3
2278 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2279 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
2280 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
2281 ; AVX512BW-NEXT: kshiftrq $8, %k0, %k3
2282 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
2283 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
2284 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2285 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
2286 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
2287 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
2288 ; AVX512BW-NEXT: korw %k6, %k2, %k2
2289 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
2290 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
2291 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2292 ; AVX512BW-NEXT: kshiftrq $9, %k0, %k0
2293 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2294 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
2295 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k3
2296 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2297 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
2298 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
2299 ; AVX512BW-NEXT: korw %k3, %k2, %k2
2300 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2301 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
2302 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
2303 ; AVX512BW-NEXT: korw %k0, %k2, %k0
2304 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
2305 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
2306 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
2307 ; AVX512BW-NEXT: korw %k2, %k0, %k0
2308 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
2309 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
2310 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
2311 ; AVX512BW-NEXT: korw %k1, %k0, %k1
2312 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
2313 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
2314 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx)
2315 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx)
2316 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx)
2317 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx)
2318 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
2319 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx)
2320 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx)
2321 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx)
2322 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx)
2323 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx)
2324 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
2325 ; AVX512BW-NEXT: vzeroupper
2326 ; AVX512BW-NEXT: retq
2327 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
2328 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
2329 %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
2330 store <192 x i32> %data, ptr %out.vec, align 64
2331 ret void
2332 }
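; The mask_replication_factor4_* tests below broadcast each source mask bit to
; 4 consecutive lanes and use the widened mask for a masked i32 load whose
; result is then stored to %out.vec.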
2334 define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
2335 ; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2:
2336 ; AVX512F-SLOW: # %bb.0:
2337 ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
2338 ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
2339 ; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2340 ; AVX512F-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
2341 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
2342 ; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
2343 ; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2344 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
2345 ; AVX512F-SLOW-NEXT: vzeroupper
2346 ; AVX512F-SLOW-NEXT: retq
2348 ; AVX512F-FAST-LABEL: mask_replication_factor4_vf2:
2349 ; AVX512F-FAST: # %bb.0:
2350 ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
2351 ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
2352 ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2353 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
2354 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
2355 ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
2356 ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2357 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
2358 ; AVX512F-FAST-NEXT: vzeroupper
2359 ; AVX512F-FAST-NEXT: retq
2361 ; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2:
2362 ; AVX512DQ-SLOW: # %bb.0:
2363 ; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0
2364 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
2365 ; AVX512DQ-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
2366 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
2367 ; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1
2368 ; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2369 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
2370 ; AVX512DQ-SLOW-NEXT: vzeroupper
2371 ; AVX512DQ-SLOW-NEXT: retq
2373 ; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2:
2374 ; AVX512DQ-FAST: # %bb.0:
2375 ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0
2376 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0
2377 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
2378 ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
2379 ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k1
2380 ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2381 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx)
2382 ; AVX512DQ-FAST-NEXT: vzeroupper
2383 ; AVX512DQ-FAST-NEXT: retq
2385 ; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2:
2386 ; AVX512BW-SLOW: # %bb.0:
2387 ; AVX512BW-SLOW-NEXT: kmovw (%rdi), %k1
2388 ; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
2389 ; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2390 ; AVX512BW-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
2391 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
2392 ; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
2393 ; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2394 ; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
2395 ; AVX512BW-SLOW-NEXT: vzeroupper
2396 ; AVX512BW-SLOW-NEXT: retq
2398 ; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2:
2399 ; AVX512BW-FAST: # %bb.0:
2400 ; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1
2401 ; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
2402 ; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2403 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
2404 ; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
2405 ; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
2406 ; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2407 ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rdx)
2408 ; AVX512BW-FAST-NEXT: vzeroupper
2409 ; AVX512BW-FAST-NEXT: retq
2411 ; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2:
2412 ; AVX512VBMI-SLOW: # %bb.0:
2413 ; AVX512VBMI-SLOW-NEXT: kmovw (%rdi), %k1
2414 ; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
2415 ; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2416 ; AVX512VBMI-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
2417 ; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
2418 ; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
2419 ; AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2420 ; AVX512VBMI-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
2421 ; AVX512VBMI-SLOW-NEXT: vzeroupper
2422 ; AVX512VBMI-SLOW-NEXT: retq
2424 ; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2:
2425 ; AVX512VBMI-FAST: # %bb.0:
2426 ; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1
2427 ; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
2428 ; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2429 ; AVX512VBMI-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
2430 ; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
2431 ; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
2432 ; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
2433 ; AVX512VBMI-FAST-NEXT: vmovdqa %ymm0, (%rdx)
2434 ; AVX512VBMI-FAST-NEXT: vzeroupper
2435 ; AVX512VBMI-FAST-NEXT: retq
2436 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
2437 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
2438 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
2439 %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
2440 %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2441 store <8 x i32> %data, ptr %out.vec, align 64
2442 ret void
2443 }
2445 define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
2446 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4:
2447 ; AVX512F-ONLY: # %bb.0:
2448 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
2449 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2450 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2451 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
2452 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
2453 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
2454 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
2455 ; AVX512F-ONLY-NEXT: vzeroupper
2456 ; AVX512F-ONLY-NEXT: retq
2458 ; AVX512DQ-LABEL: mask_replication_factor4_vf4:
2459 ; AVX512DQ: # %bb.0:
2460 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
2461 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
2462 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2463 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
2464 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
2465 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
2466 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
2467 ; AVX512DQ-NEXT: vzeroupper
2468 ; AVX512DQ-NEXT: retq
2470 ; AVX512BW-LABEL: mask_replication_factor4_vf4:
2471 ; AVX512BW: # %bb.0:
2472 ; AVX512BW-NEXT: kmovw (%rdi), %k1
2473 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2474 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2475 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
2476 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
2477 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
2478 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
2479 ; AVX512BW-NEXT: vzeroupper
2480 ; AVX512BW-NEXT: retq
2481 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
2482 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2483 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
2484 %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
2485 store <16 x i32> %data, ptr %out.vec, align 64
2486 ret void
2487 }
2489 define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
2490 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8:
2491 ; AVX512F-ONLY: # %bb.0:
2492 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
2493 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2494 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2495 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
2496 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
2497 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2498 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
2499 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
2500 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
2501 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
2502 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
2503 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
2504 ; AVX512F-ONLY-NEXT: vzeroupper
2505 ; AVX512F-ONLY-NEXT: retq
2507 ; AVX512DQ-LABEL: mask_replication_factor4_vf8:
2508 ; AVX512DQ: # %bb.0:
2509 ; AVX512DQ-NEXT: kmovb (%rdi), %k0
2510 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
2511 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2512 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
2513 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
2514 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2515 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
2516 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
2517 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
2518 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
2519 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
2520 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
2521 ; AVX512DQ-NEXT: vzeroupper
2522 ; AVX512DQ-NEXT: retq
2524 ; AVX512BW-LABEL: mask_replication_factor4_vf8:
2525 ; AVX512BW: # %bb.0:
2526 ; AVX512BW-NEXT: kmovw (%rdi), %k0
2527 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
2528 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2529 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
2530 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
2531 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
2532 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
2533 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
2534 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
2535 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
2536 ; AVX512BW-NEXT: vzeroupper
2537 ; AVX512BW-NEXT: retq
2538 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
2539 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2540 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
2541 %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
2542 store <32 x i32> %data, ptr %out.vec, align 64
2543 ret void
2544 }
2546 define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
2547 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16:
2548 ; AVX512F-ONLY: # %bb.0:
2549 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
2550 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2551 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
2552 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
2553 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
2554 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2555 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
2556 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
2557 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2558 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
2559 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
2560 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2561 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
2562 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
2563 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
2564 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
2565 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
2566 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
2567 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2568 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2569 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
2570 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2571 ; AVX512F-ONLY-NEXT: vzeroupper
2572 ; AVX512F-ONLY-NEXT: retq
2574 ; AVX512DQ-LABEL: mask_replication_factor4_vf16:
2575 ; AVX512DQ: # %bb.0:
2576 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
2577 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
2578 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
2579 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
2580 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
2581 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2582 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
2583 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
2584 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2585 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
2586 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
2587 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2588 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
2589 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
2590 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
2591 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
2592 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
2593 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
2594 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2595 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2596 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
2597 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2598 ; AVX512DQ-NEXT: vzeroupper
2599 ; AVX512DQ-NEXT: retq
2601 ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16:
2602 ; AVX512BW-ONLY: # %bb.0:
2603 ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
2604 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
2605 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
2606 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
2607 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
2608 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
2609 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
2610 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
2611 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
2612 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
2613 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
2614 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
2615 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2616 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2617 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
2618 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2619 ; AVX512BW-ONLY-NEXT: vzeroupper
2620 ; AVX512BW-ONLY-NEXT: retq
2622 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16:
2623 ; AVX512VBMI-ONLY: # %bb.0:
2624 ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
2625 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
2626 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2627 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
2628 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
2629 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
2630 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
2631 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
2632 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
2633 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
2634 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
2635 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
2636 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2637 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2638 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
2639 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2640 ; AVX512VBMI-ONLY-NEXT: vzeroupper
2641 ; AVX512VBMI-ONLY-NEXT: retq
2642 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
2643 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2644 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
2645 %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
2646 store <64 x i32> %data, ptr %out.vec, align 64
2647 ret void
2648 }
2650 define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
2651 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32:
2652 ; AVX512F-ONLY: # %bb.0:
2653 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4
2654 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
2655 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2656 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
2657 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
2658 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
2659 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2660 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2661 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3
2662 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2
2663 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2664 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4
2665 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3
2666 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2667 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0
2668 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5
2669 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1
2670 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
2671 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
2672 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1
2673 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
2674 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1
2675 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7
2676 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0
2677 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
2678 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
2679 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
2680 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
2681 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
2682 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k5} {z}
2683 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
2684 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
2685 ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2686 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
2687 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
2688 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
2689 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
2690 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
2691 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2692 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2693 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
2694 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2695 ; AVX512F-ONLY-NEXT: vzeroupper
2696 ; AVX512F-ONLY-NEXT: retq
2698 ; AVX512DQ-LABEL: mask_replication_factor4_vf32:
2699 ; AVX512DQ: # %bb.0:
2700 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
2701 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
2702 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
2703 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
2704 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
2705 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
2706 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
2707 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2708 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3
2709 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2
2710 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2711 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4
2712 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3
2713 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2714 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0
2715 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
2716 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
2717 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
2718 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
2719 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1
2720 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
2721 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1
2722 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
2723 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0
2724 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
2725 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
2726 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
2727 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
2728 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
2729 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
2730 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
2731 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
2732 ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
2733 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
2734 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
2735 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
2736 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
2737 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
2738 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2739 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2740 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
2741 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2742 ; AVX512DQ-NEXT: vzeroupper
2743 ; AVX512DQ-NEXT: retq
2745 ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf32:
2746 ; AVX512BW-ONLY: # %bb.0:
2747 ; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0
2748 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
2749 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
2750 ; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2751 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
2752 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
2753 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
2754 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
2755 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
2756 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
2757 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
2758 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
2759 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
2760 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
2761 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
2762 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
2763 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
2764 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
2765 ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
2766 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
2767 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
2768 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
2769 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
2770 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
2771 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
2772 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
2773 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
2774 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2775 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2776 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
2777 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2778 ; AVX512BW-ONLY-NEXT: vzeroupper
2779 ; AVX512BW-ONLY-NEXT: retq
2781 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf32:
2782 ; AVX512VBMI-ONLY: # %bb.0:
2783 ; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0
2784 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
2785 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
2786 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
2787 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
2788 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2789 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
2790 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
2791 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
2792 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
2793 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
2794 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
2795 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
2796 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
2797 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
2798 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
2799 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
2800 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
2801 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
2802 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
2803 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
2804 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
2805 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
2806 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
2807 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
2808 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
2809 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
2810 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
2811 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
2812 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
2813 ; AVX512VBMI-ONLY-NEXT: vzeroupper
2814 ; AVX512VBMI-ONLY-NEXT: retq
2815 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
2816 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2817 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31>
2818 %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
2819 store <128 x i32> %data, ptr %out.vec, align 64
2820 ret void
2821 }
2823 define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
2824 ; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64:
2825 ; AVX512F-ONLY: # %bb.0:
2826 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
2827 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
2828 ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
2829 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
2830 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
2831 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
2832 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
2833 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2834 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4
2835 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
2836 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
2837 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2838 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
2839 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2840 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0
2841 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm10
2842 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm11
2843 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm12
2844 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm1
2845 ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm13
2846 ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm14
2847 ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm15
2848 ; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm2
2849 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm16 {%k1} {z} = -1
2850 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3
2851 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5
2852 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7
2853 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9
2854 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
2855 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
2856 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
2857 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
2858 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
2859 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
2860 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
2861 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
2862 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
2863 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
2864 ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
2865 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
2866 ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
2867 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
2868 ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
2869 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
2870 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
2871 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
2872 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
2873 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
2874 ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
2875 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
2876 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
2877 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
2878 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
2879 ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
2880 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
2881 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
2882 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
2883 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
2884 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
2885 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
2886 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx)
2887 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx)
2888 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 832(%rdx)
2889 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 768(%rdx)
2890 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
2891 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx)
2892 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 576(%rdx)
2893 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx)
2894 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 448(%rdx)
2895 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx)
2896 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx)
2897 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
2898 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
2899 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
2900 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx)
2901 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx)
2902 ; AVX512F-ONLY-NEXT: vzeroupper
2903 ; AVX512F-ONLY-NEXT: retq
2905 ; AVX512DQ-LABEL: mask_replication_factor4_vf64:
2906 ; AVX512DQ: # %bb.0:
2907 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
2908 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
2909 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
2910 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
2911 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
2912 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
2913 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
2914 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2915 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4
2916 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11]
2917 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
2918 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
2919 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
2920 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
2921 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0
2922 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm10
2923 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm11
2924 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm12
2925 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm1
2926 ; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm13
2927 ; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm14
2928 ; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm15
2929 ; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm2
2930 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16
2931 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3
2932 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5
2933 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7
2934 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9
2935 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
2936 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
2937 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
2938 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z}
2939 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
2940 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
2941 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
2942 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
2943 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
2944 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
2945 ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
2946 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
2947 ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
2948 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
2949 ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
2950 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z}
2951 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
2952 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
2953 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
2954 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z}
2955 ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
2956 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
2957 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
2958 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
2959 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
2960 ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z}
2961 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
2962 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z}
2963 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
2964 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
2965 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
2966 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
2967 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx)
2968 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx)
2969 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 832(%rdx)
2970 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 768(%rdx)
2971 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx)
2972 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rdx)
2973 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 576(%rdx)
2974 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx)
2975 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rdx)
2976 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx)
2977 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx)
2978 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
2979 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
2980 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx)
2981 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx)
2982 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
2983 ; AVX512DQ-NEXT: vzeroupper
2984 ; AVX512DQ-NEXT: retq
2986 ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf64:
2987 ; AVX512BW-ONLY: # %bb.0:
2988 ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
2989 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
2990 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
2991 ; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
2992 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
2993 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
2994 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
2995 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
2996 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2
2997 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
2998 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
2999 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3
3000 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
3001 ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
3002 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4
3003 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
3004 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
3005 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
3006 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4
3007 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
3008 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
3009 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
3010 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
3011 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
3012 ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
3013 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3
3014 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
3015 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
3016 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
3017 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
3018 ; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
3019 ; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
3020 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
3021 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
3022 ; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
3023 ; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
3024 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
3025 ; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
3026 ; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
3027 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
3028 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
3029 ; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
3030 ; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
3031 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
3032 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
3033 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx)
3034 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx)
3035 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx)
3036 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
3037 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx)
3038 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx)
3039 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
3040 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
3041 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
3042 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
3043 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
3044 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
3045 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
3046 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
3047 ; AVX512BW-ONLY-NEXT: vzeroupper
3048 ; AVX512BW-ONLY-NEXT: retq
3050 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf64:
3051 ; AVX512VBMI-ONLY: # %bb.0:
3052 ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
3053 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
3054 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
3055 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
3056 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
3057 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47]
3058 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
3059 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2
3060 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
3061 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
3062 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3
3063 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
3064 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
3065 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4
3066 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
3067 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
3068 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
3069 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4
3070 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
3071 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
3072 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
3073 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
3074 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
3075 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
3076 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3
3077 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
3078 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
3079 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
3080 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
3081 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
3082 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
3083 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
3084 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
3085 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
3086 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
3087 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
3088 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
3089 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
3090 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
3091 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
3092 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
3093 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
3094 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
3095 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
3096 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx)
3097 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx)
3098 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx)
3099 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
3100 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx)
3101 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx)
3102 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
3103 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
3104 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
3105 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
3106 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
3107 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
3108 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
3109 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
3110 ; AVX512VBMI-ONLY-NEXT: vzeroupper
3111 ; AVX512VBMI-ONLY-NEXT: retq
3112 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
3113 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63>
3114 %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
3115 store <256 x i32> %data, ptr %out.vec, align 64
3116 ret void
3117 }
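; The mask_replication_factor5_* tests below broadcast each source mask bit to
; 5 consecutive lanes and use the widened mask for a masked i32 load whose
; result is then stored to %out.vec.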
3119 define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
3120 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2:
3121 ; AVX512F-ONLY: # %bb.0:
3122 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
3123 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
3124 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
3125 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
3126 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
3127 ; AVX512F-ONLY-NEXT: movw $1023, %ax # imm = 0x3FF
3128 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
3129 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
3130 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3131 ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, %xmm1
3132 ; AVX512F-ONLY-NEXT: vmovq %xmm1, 32(%rdx)
3133 ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
3134 ; AVX512F-ONLY-NEXT: vzeroupper
3135 ; AVX512F-ONLY-NEXT: retq
3137 ; AVX512DQ-LABEL: mask_replication_factor5_vf2:
3138 ; AVX512DQ: # %bb.0:
3139 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
3140 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
3141 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
3142 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
3143 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
3144 ; AVX512DQ-NEXT: movw $1023, %ax # imm = 0x3FF
3145 ; AVX512DQ-NEXT: kmovw %eax, %k1
3146 ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
3147 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3148 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
3149 ; AVX512DQ-NEXT: vmovq %xmm1, 32(%rdx)
3150 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
3151 ; AVX512DQ-NEXT: vzeroupper
3152 ; AVX512DQ-NEXT: retq
3153 ;
3154 ; AVX512BW-LABEL: mask_replication_factor5_vf2:
3155 ; AVX512BW: # %bb.0:
3156 ; AVX512BW-NEXT: kmovw (%rdi), %k1
3157 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
3158 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0]
3159 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
3160 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
3161 ; AVX512BW-NEXT: movw $1023, %ax # imm = 0x3FF
3162 ; AVX512BW-NEXT: kmovd %eax, %k1
3163 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
3164 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3165 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
3166 ; AVX512BW-NEXT: vmovq %xmm1, 32(%rdx)
3167 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
3168 ; AVX512BW-NEXT: vzeroupper
3169 ; AVX512BW-NEXT: retq
3170 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
3171 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
3172 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1>
3173 %data = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr %in.vec, i32 64, <10 x i1> %tgt.mask, <10 x i32> poison)
3174 %data.padded = shufflevector <10 x i32> %data, <10 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3175 store <10 x i32> %data, ptr %out.vec, align 64
3176 ret void
3177 }
3179 define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
3180 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4:
3181 ; AVX512F-ONLY: # %bb.0:
3182 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
3183 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
3184 ; AVX512F-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3185 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1
3186 ; AVX512F-ONLY-NEXT: movw $15, %ax
3187 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
3188 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
3189 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3190 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
3191 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
3192 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
3193 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
3194 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
3195 ; AVX512F-ONLY-NEXT: vmovdqa %xmm0, 64(%rdx)
3196 ; AVX512F-ONLY-NEXT: vzeroupper
3197 ; AVX512F-ONLY-NEXT: retq
3198 ;
3199 ; AVX512DQ-LABEL: mask_replication_factor5_vf4:
3200 ; AVX512DQ: # %bb.0:
3201 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
3202 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
3203 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
3204 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
3205 ; AVX512DQ-NEXT: movw $15, %ax
3206 ; AVX512DQ-NEXT: kmovw %eax, %k1
3207 ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
3208 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3209 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
3210 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
3211 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
3212 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
3213 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
3214 ; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%rdx)
3215 ; AVX512DQ-NEXT: vzeroupper
3216 ; AVX512DQ-NEXT: retq
3217 ;
3218 ; AVX512BW-LABEL: mask_replication_factor5_vf4:
3219 ; AVX512BW: # %bb.0:
3220 ; AVX512BW-NEXT: kmovd (%rdi), %k0
3221 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
3222 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0]
3223 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
3224 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
3225 ; AVX512BW-NEXT: movl $1048575, %eax # imm = 0xFFFFF
3226 ; AVX512BW-NEXT: kmovd %eax, %k1
3227 ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
3228 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
3229 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
3230 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
3231 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
3232 ; AVX512BW-NEXT: vmovdqa %xmm0, 64(%rdx)
3233 ; AVX512BW-NEXT: vzeroupper
3234 ; AVX512BW-NEXT: retq
3235 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
3236 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3237 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3>
3238 %data = call <20 x i32> @llvm.masked.load.v20i32.p0(ptr %in.vec, i32 64, <20 x i1> %tgt.mask, <20 x i32> poison)
3239 %data.padded = shufflevector <20 x i32> %data, <20 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3240 store <20 x i32> %data, ptr %out.vec, align 64
3241 ret void
3242 }
3244 define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
3245 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8:
3246 ; AVX512F-ONLY: # %bb.0:
3247 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
3248 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
3249 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3250 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
3251 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
3252 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
3253 ; AVX512F-ONLY-NEXT: movw $1, %ax
3254 ; AVX512F-ONLY-NEXT: kmovw %eax, %k2
3255 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
3256 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
3257 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
3258 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
3259 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
3260 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
3261 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
3262 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
3263 ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
3264 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
3265 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
3266 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z}
3267 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z}
3268 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
3269 ; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
3270 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
3271 ; AVX512F-ONLY-NEXT: vzeroupper
3272 ; AVX512F-ONLY-NEXT: retq
3273 ;
3274 ; AVX512DQ-LABEL: mask_replication_factor5_vf8:
3275 ; AVX512DQ: # %bb.0:
3276 ; AVX512DQ-NEXT: kmovb (%rdi), %k0
3277 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
3278 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3279 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
3280 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
3281 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
3282 ; AVX512DQ-NEXT: movw $1, %ax
3283 ; AVX512DQ-NEXT: kmovw %eax, %k1
3284 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
3285 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
3286 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
3287 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
3288 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
3289 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
3290 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7]
3291 ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
3292 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k3
3293 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3294 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
3295 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
3296 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
3297 ; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%rdx)
3298 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
3299 ; AVX512DQ-NEXT: vzeroupper
3300 ; AVX512DQ-NEXT: retq
3301 ;
3302 ; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf8:
3303 ; AVX512BW-ONLY: # %bb.0:
3304 ; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0
3305 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
3306 ; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0
3307 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3308 ; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
3309 ; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
3310 ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
3311 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
3312 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3313 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
3314 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
3315 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
3316 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
3317 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
3318 ; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
3319 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
3320 ; AVX512BW-ONLY-NEXT: vzeroupper
3321 ; AVX512BW-ONLY-NEXT: retq
3322 ;
3323 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf8:
3324 ; AVX512VBMI-ONLY: # %bb.0:
3325 ; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0
3326 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
3327 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3328 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
3329 ; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
3330 ; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
3331 ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
3332 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
3333 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3334 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
3335 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
3336 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
3337 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
3338 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
3339 ; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
3340 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
3341 ; AVX512VBMI-ONLY-NEXT: vzeroupper
3342 ; AVX512VBMI-ONLY-NEXT: retq
3343 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
3344 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3345 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7>
3346 %data = call <40 x i32> @llvm.masked.load.v40i32.p0(ptr %in.vec, i32 64, <40 x i1> %tgt.mask, <40 x i32> poison)
3347 %data.padded = shufflevector <40 x i32> %data, <40 x i32> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3348 store <40 x i32> %data, ptr %out.vec, align 64
3349 ret void
3350 }
3352 define void @mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
3353 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16:
3354 ; AVX512F-ONLY: # %bb.0:
3355 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
3356 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
3357 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3358 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
3359 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
3360 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
3361 ; AVX512F-ONLY-NEXT: movw $1, %ax
3362 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
3363 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
3364 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
3365 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
3366 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
3367 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
3368 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
3369 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
3370 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
3371 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
3372 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
3373 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
3374 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
3375 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
3376 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5
3377 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3378 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
3379 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
3380 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
3381 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
3382 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
3383 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
3384 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
3385 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx)
3386 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
3387 ; AVX512F-ONLY-NEXT: vzeroupper
3388 ; AVX512F-ONLY-NEXT: retq
3389 ;
3390 ; AVX512DQ-LABEL: mask_replication_factor5_vf16:
3391 ; AVX512DQ: # %bb.0:
3392 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
3393 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
3394 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3395 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
3396 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
3397 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
3398 ; AVX512DQ-NEXT: movw $1, %ax
3399 ; AVX512DQ-NEXT: kmovw %eax, %k1
3400 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
3401 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
3402 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
3403 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
3404 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
3405 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
3406 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
3407 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
3408 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
3409 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
3410 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
3411 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
3412 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
3413 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5
3414 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3415 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z}
3416 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z}
3417 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
3418 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z}
3419 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
3420 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
3421 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
3422 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rdx)
3423 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
3424 ; AVX512DQ-NEXT: vzeroupper
3425 ; AVX512DQ-NEXT: retq
3426 ;
3427 ; AVX512BW-LABEL: mask_replication_factor5_vf16:
3428 ; AVX512BW: # %bb.0:
3429 ; AVX512BW-NEXT: kmovw (%rdi), %k1
3430 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
3431 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3432 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
3433 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
3434 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
3435 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
3436 ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
3437 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
3438 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
3439 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
3440 ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3
3441 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
3442 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
3443 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
3444 ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4
3445 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1
3446 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z}
3447 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
3448 ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0
3449 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
3450 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
3451 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
3452 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx)
3453 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
3454 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
3455 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
3456 ; AVX512BW-NEXT: vzeroupper
3457 ; AVX512BW-NEXT: retq
3458 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
3459 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3460 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15>
3461 %data = call <80 x i32> @llvm.masked.load.v80i32.p0(ptr %in.vec, i32 64, <80 x i1> %tgt.mask, <80 x i32> poison)
3462 store <80 x i32> %data, ptr %out.vec, align 64
3463 ret void
3464 }
3466 define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
3467 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32:
3468 ; AVX512F-ONLY: # %bb.0:
3469 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
3470 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
3471 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3472 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
3473 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
3474 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
3475 ; AVX512F-ONLY-NEXT: movw $1, %ax
3476 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
3477 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
3478 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
3479 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
3480 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
3481 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
3482 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4
3483 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
3484 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
3485 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
3486 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
3487 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
3488 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0
3489 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1
3490 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2
3491 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5
3492 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7
3493 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm3
3494 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
3495 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
3496 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
3497 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
3498 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
3499 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
3500 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
3501 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
3502 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
3503 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
3504 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
3505 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
3506 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
3507 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
3508 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
3509 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
3510 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
3511 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
3512 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
3513 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
3514 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
3515 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
3516 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 256(%rdx)
3517 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
3518 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx)
3519 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx)
3520 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 512(%rdx)
3521 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 576(%rdx)
3522 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx)
3523 ; AVX512F-ONLY-NEXT: vzeroupper
3524 ; AVX512F-ONLY-NEXT: retq
3525 ;
3526 ; AVX512DQ-LABEL: mask_replication_factor5_vf32:
3527 ; AVX512DQ: # %bb.0:
3528 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
3529 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
3530 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
3531 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
3532 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
3533 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
3534 ; AVX512DQ-NEXT: movw $1, %ax
3535 ; AVX512DQ-NEXT: kmovw %eax, %k1
3536 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
3537 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
3538 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
3539 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
3540 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
3541 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4
3542 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
3543 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
3544 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
3545 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
3546 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
3547 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0
3548 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1
3549 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2
3550 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5
3551 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7
3552 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm3
3553 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z}
3554 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
3555 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
3556 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
3557 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z}
3558 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
3559 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
3560 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
3561 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
3562 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
3563 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
3564 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
3565 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z}
3566 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
3567 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
3568 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
3569 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
3570 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
3571 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
3572 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
3573 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx)
3574 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
3575 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rdx)
3576 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
3577 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx)
3578 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx)
3579 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rdx)
3580 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rdx)
3581 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
3582 ; AVX512DQ-NEXT: vzeroupper
3583 ; AVX512DQ-NEXT: retq
3584 ;
3585 ; AVX512BW-LABEL: mask_replication_factor5_vf32:
3586 ; AVX512BW: # %bb.0:
3587 ; AVX512BW-NEXT: kmovd (%rdi), %k5
3588 ; AVX512BW-NEXT: kshiftrd $1, %k5, %k1
3589 ; AVX512BW-NEXT: movw $-3, %ax
3590 ; AVX512BW-NEXT: kmovd %eax, %k6
3591 ; AVX512BW-NEXT: kmovw (%rdi), %k2
3592 ; AVX512BW-NEXT: kandw %k6, %k2, %k3
3593 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
3594 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
3595 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3596 ; AVX512BW-NEXT: movw $-5, %ax
3597 ; AVX512BW-NEXT: kmovd %eax, %k4
3598 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3599 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3600 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
3601 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3602 ; AVX512BW-NEXT: movw $-9, %ax
3603 ; AVX512BW-NEXT: kmovd %eax, %k4
3604 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3605 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3606 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
3607 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3608 ; AVX512BW-NEXT: movw $-17, %ax
3609 ; AVX512BW-NEXT: kmovd %eax, %k4
3610 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3611 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3612 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k2
3613 ; AVX512BW-NEXT: korw %k2, %k3, %k2
3614 ; AVX512BW-NEXT: movw $-33, %ax
3615 ; AVX512BW-NEXT: kmovd %eax, %k3
3616 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3617 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
3618 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
3619 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k3
3620 ; AVX512BW-NEXT: korw %k3, %k2, %k2
3621 ; AVX512BW-NEXT: movw $-65, %ax
3622 ; AVX512BW-NEXT: kmovd %eax, %k3
3623 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3624 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
3625 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
3626 ; AVX512BW-NEXT: korw %k3, %k2, %k2
3627 ; AVX512BW-NEXT: movw $-129, %ax
3628 ; AVX512BW-NEXT: kmovd %eax, %k3
3629 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3630 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
3631 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
3632 ; AVX512BW-NEXT: korw %k3, %k2, %k2
3633 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
3634 ; AVX512BW-NEXT: kmovd %eax, %k3
3635 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3636 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
3637 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
3638 ; AVX512BW-NEXT: korw %k3, %k2, %k2
3639 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
3640 ; AVX512BW-NEXT: kmovd %eax, %k7
3641 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
3642 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3643 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
3644 ; AVX512BW-NEXT: korw %k1, %k2, %k1
3645 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
3646 ; AVX512BW-NEXT: kmovd %eax, %k2
3647 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3648 ; AVX512BW-NEXT: kandw %k2, %k1, %k3
3649 ; AVX512BW-NEXT: kshiftrd $2, %k5, %k1
3650 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
3651 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4
3652 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3653 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
3654 ; AVX512BW-NEXT: kmovd %eax, %k4
3655 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3656 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3657 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k4
3658 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3659 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
3660 ; AVX512BW-NEXT: kmovd %eax, %k4
3661 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3662 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3663 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k4
3664 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3665 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
3666 ; AVX512BW-NEXT: kmovd %eax, %k4
3667 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3668 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3669 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
3670 ; AVX512BW-NEXT: korw %k2, %k3, %k2
3671 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
3672 ; AVX512BW-NEXT: kmovd %eax, %k3
3673 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3674 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
3675 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
3676 ; AVX512BW-NEXT: korw %k1, %k2, %k1
3677 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
3678 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
3679 ; AVX512BW-NEXT: kshiftrd $3, %k5, %k2
3680 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3681 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
3682 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3683 ; AVX512BW-NEXT: korw %k2, %k1, %k1
3684 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
3685 ; AVX512BW-NEXT: kshiftrd $29, %k5, %k1
3686 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
3687 ; AVX512BW-NEXT: kshiftrd $28, %k5, %k1
3688 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3689 ; AVX512BW-NEXT: kandw %k6, %k1, %k3
3690 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3691 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
3692 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3693 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3694 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
3695 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
3696 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3697 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3698 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
3699 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
3700 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3701 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3702 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
3703 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k4
3704 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3705 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3706 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
3707 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
3708 ; AVX512BW-NEXT: korw %k2, %k3, %k2
3709 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3710 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
3711 ; AVX512BW-NEXT: kshiftrd $30, %k5, %k3
3712 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
3713 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
3714 ; AVX512BW-NEXT: korw %k4, %k2, %k2
3715 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3716 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
3717 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
3718 ; AVX512BW-NEXT: korw %k4, %k2, %k2
3719 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3720 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
3721 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
3722 ; AVX512BW-NEXT: korw %k4, %k2, %k2
3723 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
3724 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k4
3725 ; AVX512BW-NEXT: korw %k4, %k2, %k2
3726 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3727 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
3728 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
3729 ; AVX512BW-NEXT: korw %k3, %k2, %k2
3730 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3731 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
3732 ; AVX512BW-NEXT: kshiftrd $31, %k5, %k3
3733 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
3734 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
3735 ; AVX512BW-NEXT: korw %k7, %k2, %k2
3736 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
3737 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
3738 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k7
3739 ; AVX512BW-NEXT: korw %k7, %k2, %k2
3740 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
3741 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
3742 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k7
3743 ; AVX512BW-NEXT: korw %k7, %k2, %k2
3744 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
3745 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
3746 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
3747 ; AVX512BW-NEXT: korw %k3, %k2, %k2
3748 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
3749 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
3750 ; AVX512BW-NEXT: korw %k4, %k2, %k2
3751 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z}
3752 ; AVX512BW-NEXT: kshiftrd $25, %k5, %k2
3753 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3754 ; AVX512BW-NEXT: kandw %k6, %k2, %k3
3755 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
3756 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3757 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k7
3758 ; AVX512BW-NEXT: korw %k7, %k3, %k3
3759 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
3760 ; AVX512BW-NEXT: kshiftrd $26, %k5, %k7
3761 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
3762 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k6
3763 ; AVX512BW-NEXT: korw %k6, %k3, %k3
3764 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3765 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
3766 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
3767 ; AVX512BW-NEXT: korw %k6, %k3, %k3
3768 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3769 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
3770 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
3771 ; AVX512BW-NEXT: korw %k6, %k3, %k3
3772 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3773 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
3774 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
3775 ; AVX512BW-NEXT: korw %k6, %k3, %k3
3776 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3777 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
3778 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
3779 ; AVX512BW-NEXT: korw %k6, %k3, %k3
3780 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3781 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
3782 ; AVX512BW-NEXT: kshiftrd $27, %k5, %k6
3783 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
3784 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k7
3785 ; AVX512BW-NEXT: korw %k7, %k3, %k3
3786 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3787 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
3788 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
3789 ; AVX512BW-NEXT: korw %k7, %k3, %k3
3790 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
3791 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3792 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
3793 ; AVX512BW-NEXT: korw %k7, %k3, %k3
3794 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
3795 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3796 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
3797 ; AVX512BW-NEXT: korw %k7, %k3, %k3
3798 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
3799 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k6
3800 ; AVX512BW-NEXT: korw %k6, %k3, %k3
3801 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3802 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
3803 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
3804 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
3805 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
3806 ; AVX512BW-NEXT: korw %k7, %k3, %k3
3807 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3808 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
3809 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
3810 ; AVX512BW-NEXT: korw %k7, %k3, %k3
3811 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
3812 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3813 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k1
3814 ; AVX512BW-NEXT: korw %k1, %k3, %k1
3815 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
3816 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
3817 ; AVX512BW-NEXT: korw %k6, %k1, %k1
3818 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
3819 ; AVX512BW-NEXT: kshiftrd $22, %k5, %k0
3820 ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
3821 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3822 ; AVX512BW-NEXT: kandw %k1, %k0, %k6
3823 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
3824 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3825 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k7
3826 ; AVX512BW-NEXT: korw %k7, %k6, %k6
3827 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3828 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
3829 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k7
3830 ; AVX512BW-NEXT: korw %k7, %k6, %k6
3831 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
3832 ; AVX512BW-NEXT: kandw %k0, %k6, %k6
3833 ; AVX512BW-NEXT: kshiftrd $23, %k5, %k7
3834 ; AVX512BW-NEXT: kmovq %k5, %k0
3835 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
3836 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k5
3837 ; AVX512BW-NEXT: korw %k5, %k6, %k5
3838 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3839 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
3840 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
3841 ; AVX512BW-NEXT: korw %k6, %k5, %k5
3842 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3843 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
3844 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
3845 ; AVX512BW-NEXT: korw %k6, %k5, %k5
3846 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3847 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
3848 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
3849 ; AVX512BW-NEXT: korw %k6, %k5, %k5
3850 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3851 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
3852 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6
3853 ; AVX512BW-NEXT: korw %k6, %k5, %k5
3854 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
3855 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
3856 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
3857 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
3858 ; AVX512BW-NEXT: korw %k7, %k5, %k5
3859 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3860 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
3861 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
3862 ; AVX512BW-NEXT: korw %k7, %k5, %k5
3863 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
3864 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
3865 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
3866 ; AVX512BW-NEXT: korw %k7, %k5, %k5
3867 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3868 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
3869 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
3870 ; AVX512BW-NEXT: korw %k7, %k5, %k5
3871 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3872 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
3873 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k6
3874 ; AVX512BW-NEXT: korw %k6, %k5, %k5
3875 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3876 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
3877 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
3878 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
3879 ; AVX512BW-NEXT: korw %k6, %k5, %k5
3880 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
3881 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
3882 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
3883 ; AVX512BW-NEXT: korw %k2, %k5, %k2
3884 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
3885 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
3886 ; AVX512BW-NEXT: korw %k7, %k2, %k2
3887 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z}
3888 ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
3889 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
3890 ; AVX512BW-NEXT: kandw %k7, %k2, %k4
3891 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
3892 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
3893 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k5
3894 ; AVX512BW-NEXT: korw %k5, %k4, %k4
3895 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3896 ; AVX512BW-NEXT: kandw %k2, %k4, %k4
3897 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5
3898 ; AVX512BW-NEXT: korw %k5, %k4, %k4
3899 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3900 ; AVX512BW-NEXT: kandw %k2, %k4, %k4
3901 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k5
3902 ; AVX512BW-NEXT: korw %k5, %k4, %k4
3903 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3904 ; AVX512BW-NEXT: kandw %k2, %k4, %k4
3905 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k5
3906 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
3907 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
3908 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3909 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3910 ; AVX512BW-NEXT: kandw %k2, %k4, %k4
3911 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
3912 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3913 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
3914 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
3915 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k6
3916 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3917 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
3918 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
3919 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k6
3920 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3921 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
3922 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
3923 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k5
3924 ; AVX512BW-NEXT: korw %k5, %k4, %k4
3925 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
3926 ; AVX512BW-NEXT: kshiftrd $21, %k0, %k5
3927 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
3928 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
3929 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3930 ; AVX512BW-NEXT: kandw %k3, %k4, %k4
3931 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
3932 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3933 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3934 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
3935 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
3936 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3937 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3938 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
3939 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
3940 ; AVX512BW-NEXT: korw %k6, %k4, %k4
3941 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3942 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
3943 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
3944 ; AVX512BW-NEXT: korw %k5, %k4, %k4
3945 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
3946 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
3947 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
3948 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
3949 ; AVX512BW-NEXT: korw %k1, %k4, %k1
3950 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
3951 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
3952 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
3953 ; AVX512BW-NEXT: korw %k3, %k1, %k1
3954 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm4 {%k1} {z}
3955 ; AVX512BW-NEXT: kshiftrd $16, %k0, %k1
3956 ; AVX512BW-NEXT: kandw %k7, %k1, %k3
3957 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
3958 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k4
3959 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3960 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
3961 ; AVX512BW-NEXT: kandw %k6, %k3, %k3
3962 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k4
3963 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3964 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
3965 ; AVX512BW-NEXT: kandw %k7, %k3, %k3
3966 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k4
3967 ; AVX512BW-NEXT: korw %k4, %k3, %k3
3968 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
3969 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
3970 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k1
3971 ; AVX512BW-NEXT: korw %k1, %k3, %k1
3972 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
3973 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k3
3974 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
3975 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k4
3976 ; AVX512BW-NEXT: korw %k4, %k1, %k1
3977 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3978 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
3979 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
3980 ; AVX512BW-NEXT: korw %k4, %k1, %k1
3981 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3982 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
3983 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
3984 ; AVX512BW-NEXT: korw %k4, %k1, %k1
3985 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3986 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
3987 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
3988 ; AVX512BW-NEXT: korw %k4, %k1, %k1
3989 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3990 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
3991 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
3992 ; AVX512BW-NEXT: korw %k3, %k1, %k1
3993 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
3994 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
3995 ; AVX512BW-NEXT: kshiftrd $18, %k0, %k3
3996 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
3997 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k5
3998 ; AVX512BW-NEXT: korw %k5, %k1, %k1
3999 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4000 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4001 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k5
4002 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4003 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4004 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4005 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k5
4006 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4007 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4008 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4009 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k4
4010 ; AVX512BW-NEXT: korw %k4, %k1, %k1
4011 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4012 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4013 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
4014 ; AVX512BW-NEXT: korw %k3, %k1, %k1
4015 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
4016 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
4017 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4018 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4019 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
4020 ; AVX512BW-NEXT: kshiftrd $13, %k0, %k1
4021 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4022 ; AVX512BW-NEXT: kshiftrd $12, %k0, %k3
4023 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4024 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4025 ; AVX512BW-NEXT: kandw %k2, %k3, %k2
4026 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k4
4027 ; AVX512BW-NEXT: korw %k4, %k2, %k2
4028 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
4029 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k4
4030 ; AVX512BW-NEXT: korw %k4, %k2, %k2
4031 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
4032 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k4
4033 ; AVX512BW-NEXT: korw %k4, %k2, %k2
4034 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4035 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
4036 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k4
4037 ; AVX512BW-NEXT: korw %k4, %k2, %k2
4038 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4039 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
4040 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
4041 ; AVX512BW-NEXT: korw %k1, %k2, %k1
4042 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4043 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4044 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k2
4045 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
4046 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k4
4047 ; AVX512BW-NEXT: korw %k4, %k1, %k1
4048 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4049 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4050 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k4
4051 ; AVX512BW-NEXT: korw %k4, %k1, %k1
4052 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4053 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4054 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k4
4055 ; AVX512BW-NEXT: korw %k4, %k1, %k1
4056 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4057 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4058 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k4
4059 ; AVX512BW-NEXT: korw %k4, %k1, %k1
4060 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4061 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4062 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k2
4063 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4064 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4065 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4066 ; AVX512BW-NEXT: kshiftrd $15, %k0, %k2
4067 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k4
4068 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k5
4069 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4070 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4071 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4072 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k5
4073 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4074 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
4075 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
4076 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k5
4077 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4078 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
4079 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
4080 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
4081 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4082 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
4083 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
4084 ; AVX512BW-NEXT: korw %k4, %k1, %k1
4085 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z}
4086 ; AVX512BW-NEXT: kshiftrd $9, %k0, %k2
4087 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4088 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4089 ; AVX512BW-NEXT: kandw %k1, %k2, %k4
4090 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k1
4091 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4092 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
4093 ; AVX512BW-NEXT: korw %k5, %k4, %k4
4094 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
4095 ; AVX512BW-NEXT: kshiftrd $10, %k0, %k5
4096 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
4097 ; AVX512BW-NEXT: kshiftrw $13, %k5, %k6
4098 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4099 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4100 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4101 ; AVX512BW-NEXT: kshiftrw $12, %k5, %k6
4102 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4103 ; AVX512BW-NEXT: kandw %k7, %k4, %k4
4104 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
4105 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4106 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4107 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4108 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
4109 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4110 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4111 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4112 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k5
4113 ; AVX512BW-NEXT: korw %k5, %k4, %k4
4114 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4115 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4116 ; AVX512BW-NEXT: kshiftrd $11, %k0, %k5
4117 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
4118 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k6
4119 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4120 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4121 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4122 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
4123 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4124 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4125 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4126 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
4127 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4128 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4129 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4130 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
4131 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4132 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4133 ; AVX512BW-NEXT: kandw %k2, %k4, %k4
4134 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
4135 ; AVX512BW-NEXT: korw %k5, %k4, %k4
4136 ; AVX512BW-NEXT: kandw %k3, %k4, %k4
4137 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
4138 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k5
4139 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
4140 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4141 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4142 ; AVX512BW-NEXT: kandw %k1, %k4, %k4
4143 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k6
4144 ; AVX512BW-NEXT: korw %k6, %k4, %k4
4145 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4146 ; AVX512BW-NEXT: kandw %k3, %k4, %k4
4147 ; AVX512BW-NEXT: kshiftlw $14, %k7, %k3
4148 ; AVX512BW-NEXT: korw %k3, %k4, %k3
4149 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
4150 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
4151 ; AVX512BW-NEXT: korw %k5, %k3, %k3
4152 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z}
4153 ; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
4154 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
4155 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4156 ; AVX512BW-NEXT: kandw %k3, %k4, %k5
4157 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
4158 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4159 ; AVX512BW-NEXT: kshiftrw $14, %k4, %k6
4160 ; AVX512BW-NEXT: korw %k6, %k5, %k5
4161 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4162 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
4163 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
4164 ; AVX512BW-NEXT: korw %k6, %k5, %k5
4165 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4166 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
4167 ; AVX512BW-NEXT: kshiftrd $7, %k0, %k6
4168 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
4169 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
4170 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4171 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4172 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
4173 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
4174 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4175 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4176 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
4177 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
4178 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4179 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4180 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
4181 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
4182 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4183 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4184 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
4185 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
4186 ; AVX512BW-NEXT: korw %k6, %k5, %k5
4187 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4188 ; AVX512BW-NEXT: kandw %k6, %k5, %k5
4189 ; AVX512BW-NEXT: kshiftrd $8, %k0, %k6
4190 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
4191 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
4192 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4193 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4194 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
4195 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
4196 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4197 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4198 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
4199 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
4200 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4201 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
4202 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
4203 ; AVX512BW-NEXT: korw %k7, %k5, %k5
4204 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4205 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
4206 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k6
4207 ; AVX512BW-NEXT: korw %k6, %k5, %k5
4208 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
4209 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4210 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
4211 ; AVX512BW-NEXT: korw %k6, %k5, %k5
4212 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4213 ; AVX512BW-NEXT: kandw %k6, %k5, %k5
4214 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
4215 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
4216 ; AVX512BW-NEXT: korw %k2, %k5, %k2
4217 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
4218 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
4219 ; AVX512BW-NEXT: korw %k1, %k2, %k1
4220 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z}
4221 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
4222 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4223 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4224 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
4225 ; AVX512BW-NEXT: kshiftrw $14, %k5, %k2
4226 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4227 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4228 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4229 ; AVX512BW-NEXT: kshiftrw $13, %k5, %k2
4230 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4231 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4232 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4233 ; AVX512BW-NEXT: kshiftrw $12, %k5, %k2
4234 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4235 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4236 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4237 ; AVX512BW-NEXT: kshiftrd $4, %k0, %k2
4238 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
4239 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
4240 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4241 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
4242 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
4243 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k5
4244 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4245 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
4246 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
4247 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4248 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4249 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
4250 ; AVX512BW-NEXT: korw %k5, %k1, %k1
4251 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4252 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4253 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
4254 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4255 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4256 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4257 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k0
4258 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
4259 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
4260 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4261 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4262 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4263 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
4264 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4265 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4266 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4267 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
4268 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4269 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
4270 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
4271 ; AVX512BW-NEXT: korw %k2, %k1, %k1
4272 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4273 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4274 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
4275 ; AVX512BW-NEXT: korw %k0, %k1, %k0
4276 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4277 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
4278 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
4279 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4280 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4281 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
4282 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4283 ; AVX512BW-NEXT: korw %k1, %k0, %k1
4284 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z}
4285 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
4286 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx)
4287 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx)
4288 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx)
4289 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
4290 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx)
4291 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx)
4292 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 512(%rdx)
4293 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%rdx)
4294 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
4295 ; AVX512BW-NEXT: vzeroupper
4296 ; AVX512BW-NEXT: retq
4297 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
4298 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
4299 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31>
4300 %data = call <160 x i32> @llvm.masked.load.v160i32.p0(ptr %in.vec, i32 64, <160 x i1> %tgt.mask, <160 x i32> poison)
4301 store <160 x i32> %data, ptr %out.vec, align 64
4302 ret void
4303 }
4305 define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
4306 ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64:
4307 ; AVX512F-ONLY: # %bb.0:
4308 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
4309 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
4310 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
4311 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1
4312 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
4313 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
4314 ; AVX512F-ONLY-NEXT: movw $1, %ax
4315 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
4316 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
4317 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
4318 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
4319 ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
4320 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
4321 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
4322 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm6 {%k1} {z} = -1
4323 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
4324 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
4325 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1
4326 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
4327 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2
4328 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
4329 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10
4330 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
4331 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12
4332 ; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4
4333 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13
4334 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm14
4335 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm15
4336 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm16
4337 ; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5
4338 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm17
4339 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm18
4340 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm19
4341 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm3
4342 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm6
4343 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7
4344 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm8, %zmm8
4345 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9
4346 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0
4347 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
4348 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
4349 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
4350 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
4351 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
4352 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
4353 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
4354 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
4355 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
4356 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
4357 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
4358 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
4359 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
4360 ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1
4361 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
4362 ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1
4363 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
4364 ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1
4365 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
4366 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
4367 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
4368 ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1
4369 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
4370 ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
4371 ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
4372 ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
4373 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
4374 ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
4375 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
4376 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
4377 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
4378 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
4379 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
4380 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
4381 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
4382 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
4383 ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
4384 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
4385 ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
4386 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1216(%rdx)
4387 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1152(%rdx)
4388 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx)
4389 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx)
4390 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx)
4391 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx)
4392 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx)
4393 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx)
4394 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx)
4395 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 640(%rdx)
4396 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx)
4397 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx)
4398 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx)
4399 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx)
4400 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
4401 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx)
4402 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
4403 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx)
4404 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
4405 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx)
4406 ; AVX512F-ONLY-NEXT: vzeroupper
4407 ; AVX512F-ONLY-NEXT: retq
4408 ;
4409 ; AVX512DQ-LABEL: mask_replication_factor5_vf64:
4410 ; AVX512DQ: # %bb.0:
4411 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
4412 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
4413 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3]
4414 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1
4415 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
4416 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
4417 ; AVX512DQ-NEXT: movw $1, %ax
4418 ; AVX512DQ-NEXT: kmovw %eax, %k1
4419 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
4420 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
4421 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
4422 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
4423 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
4424 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
4425 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6
4426 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
4427 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15]
4428 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1
4429 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12]
4430 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2
4431 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9]
4432 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10
4433 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6]
4434 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12
4435 ; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4
4436 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13
4437 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm14
4438 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm15
4439 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm16
4440 ; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5
4441 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm17
4442 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm18
4443 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm9, %zmm19
4444 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3
4445 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm6
4446 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7
4447 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm8, %zmm8
4448 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9
4449 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0
4450 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
4451 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
4452 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
4453 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
4454 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
4455 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
4456 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
4457 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
4458 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
4459 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
4460 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
4461 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
4462 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
4463 ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1
4464 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z}
4465 ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1
4466 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
4467 ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1
4468 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z}
4469 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
4470 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z}
4471 ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1
4472 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z}
4473 ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
4474 ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
4475 ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
4476 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z}
4477 ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
4478 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z}
4479 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
4480 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z}
4481 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
4482 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z}
4483 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
4484 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z}
4485 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
4486 ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z}
4487 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
4488 ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z}
4489 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rdx)
4490 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1152(%rdx)
4491 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx)
4492 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx)
4493 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx)
4494 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx)
4495 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx)
4496 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx)
4497 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx)
4498 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 640(%rdx)
4499 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx)
4500 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx)
4501 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx)
4502 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx)
4503 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx)
4504 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx)
4505 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
4506 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx)
4507 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
4508 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx)
4509 ; AVX512DQ-NEXT: vzeroupper
4510 ; AVX512DQ-NEXT: retq
4511 ;
4512 ; AVX512BW-LABEL: mask_replication_factor5_vf64:
4513 ; AVX512BW: # %bb.0:
4514 ; AVX512BW-NEXT: kmovq (%rdi), %k5
4515 ; AVX512BW-NEXT: kshiftrq $1, %k5, %k0
4516 ; AVX512BW-NEXT: movw $-3, %ax
4517 ; AVX512BW-NEXT: kmovd %eax, %k1
4518 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4519 ; AVX512BW-NEXT: kmovw (%rdi), %k2
4520 ; AVX512BW-NEXT: kandw %k1, %k2, %k3
4521 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
4522 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
4523 ; AVX512BW-NEXT: korw %k4, %k3, %k3
4524 ; AVX512BW-NEXT: movw $-5, %ax
4525 ; AVX512BW-NEXT: kmovd %eax, %k1
4526 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4527 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
4528 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
4529 ; AVX512BW-NEXT: korw %k4, %k3, %k3
4530 ; AVX512BW-NEXT: movw $-9, %ax
4531 ; AVX512BW-NEXT: kmovd %eax, %k1
4532 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4533 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
4534 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
4535 ; AVX512BW-NEXT: korw %k4, %k3, %k3
4536 ; AVX512BW-NEXT: movw $-17, %ax
4537 ; AVX512BW-NEXT: kmovd %eax, %k1
4538 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4539 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
4540 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k2
4541 ; AVX512BW-NEXT: korw %k2, %k3, %k2
4542 ; AVX512BW-NEXT: movw $-33, %ax
4543 ; AVX512BW-NEXT: kmovd %eax, %k1
4544 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4545 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
4546 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
4547 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k3
4548 ; AVX512BW-NEXT: korw %k3, %k2, %k2
4549 ; AVX512BW-NEXT: movw $-65, %ax
4550 ; AVX512BW-NEXT: kmovd %eax, %k1
4551 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4552 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
4553 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k3
4554 ; AVX512BW-NEXT: korw %k3, %k2, %k2
4555 ; AVX512BW-NEXT: movw $-129, %ax
4556 ; AVX512BW-NEXT: kmovd %eax, %k1
4557 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4558 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
4559 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k3
4560 ; AVX512BW-NEXT: korw %k3, %k2, %k2
4561 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
4562 ; AVX512BW-NEXT: kmovd %eax, %k1
4563 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4564 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
4565 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k3
4566 ; AVX512BW-NEXT: korw %k3, %k2, %k2
4567 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
4568 ; AVX512BW-NEXT: kmovd %eax, %k1
4569 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4570 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
4571 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
4572 ; AVX512BW-NEXT: korw %k0, %k2, %k0
4573 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
4574 ; AVX512BW-NEXT: kmovd %eax, %k1
4575 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4576 ; AVX512BW-NEXT: kandw %k1, %k0, %k3
4577 ; AVX512BW-NEXT: kshiftrq $2, %k5, %k0
4578 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k2
4579 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4
4580 ; AVX512BW-NEXT: korw %k4, %k3, %k3
4581 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
4582 ; AVX512BW-NEXT: kmovd %eax, %k1
4583 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4584 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
4585 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k4
4586 ; AVX512BW-NEXT: korw %k4, %k3, %k3
4587 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
4588 ; AVX512BW-NEXT: kmovd %eax, %k1
4589 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4590 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
4591 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k7
4592 ; AVX512BW-NEXT: korw %k7, %k3, %k7
4593 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
4594 ; AVX512BW-NEXT: kmovd %eax, %k1
4595 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4596 ; AVX512BW-NEXT: kandw %k1, %k7, %k7
4597 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
4598 ; AVX512BW-NEXT: korw %k2, %k7, %k7
4599 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
4600 ; AVX512BW-NEXT: kmovd %eax, %k6
4601 ; AVX512BW-NEXT: kandw %k6, %k7, %k7
4602 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
4603 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
4604 ; AVX512BW-NEXT: korw %k0, %k7, %k0
4605 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4606 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k1
4607 ; AVX512BW-NEXT: kshiftrq $3, %k5, %k7
4608 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k0
4609 ; AVX512BW-NEXT: korw %k0, %k1, %k1
4610 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
4611 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4612 ; AVX512BW-NEXT: kandw %k2, %k7, %k1
4613 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k7
4614 ; AVX512BW-NEXT: korw %k7, %k1, %k1
4615 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4616 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4617 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k7
4618 ; AVX512BW-NEXT: korw %k7, %k1, %k1
4619 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4620 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
4621 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
4622 ; AVX512BW-NEXT: korw %k0, %k1, %k0
4623 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4624 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4625 ; AVX512BW-NEXT: kshiftrq $4, %k5, %k1
4626 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4627 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k7
4628 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4629 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4630 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4631 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k7
4632 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4633 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4634 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4635 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k7
4636 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4637 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4638 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4639 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k7
4640 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4641 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4642 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4643 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
4644 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4645 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4646 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4647 ; AVX512BW-NEXT: kshiftrq $5, %k5, %k1
4648 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4649 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7
4650 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4651 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4652 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4653 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
4654 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4655 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4656 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4657 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
4658 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4659 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4660 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4661 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
4662 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4663 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4664 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4665 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
4666 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4667 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4668 ; AVX512BW-NEXT: kshiftrq $6, %k5, %k1
4669 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
4670 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4671 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4672 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
4673 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k7
4674 ; AVX512BW-NEXT: korw %k7, %k0, %k6
4675 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z}
4676 ; AVX512BW-NEXT: kandw %k2, %k1, %k0
4677 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1
4678 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4679 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4680 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4681 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1
4682 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4683 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4684 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4685 ; AVX512BW-NEXT: kshiftrq $7, %k5, %k1
4686 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4687 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
4688 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4689 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4690 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4691 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
4692 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4693 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4694 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
4695 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4696 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4697 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4698 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
4699 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4700 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4701 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4702 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
4703 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4704 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4705 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4706 ; AVX512BW-NEXT: kshiftrq $8, %k5, %k1
4707 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4708 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
4709 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4710 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4711 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4712 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
4713 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4714 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4715 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4716 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
4717 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4718 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4719 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
4720 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4721 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4722 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4723 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
4724 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4725 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4726 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4727 ; AVX512BW-NEXT: kshiftrq $9, %k5, %k1
4728 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
4729 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
4730 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4731 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4732 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4733 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
4734 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4735 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4736 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
4737 ; AVX512BW-NEXT: korw %k6, %k0, %k7
4738 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z}
4739 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
4740 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
4741 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
4742 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4743 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4744 ; AVX512BW-NEXT: kshiftrq $10, %k5, %k1
4745 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4746 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
4747 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4748 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4749 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4750 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
4751 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4752 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4753 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4754 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
4755 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4756 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4757 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4758 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
4759 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4760 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4761 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
4762 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4763 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4764 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4765 ; AVX512BW-NEXT: kshiftrq $11, %k5, %k1
4766 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4767 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
4768 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4769 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4770 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4771 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
4772 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4773 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4774 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4775 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
4776 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4777 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4778 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4779 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
4780 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4781 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4782 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4783 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
4784 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4785 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4786 ; AVX512BW-NEXT: kshiftrq $12, %k5, %k1
4787 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
4788 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
4789 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4790 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4791 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4792 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
4793 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4794 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4795 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4796 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
4797 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4798 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4799 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
4800 ; AVX512BW-NEXT: korw %k6, %k0, %k6
4801 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z}
4802 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
4803 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
4804 ; AVX512BW-NEXT: kshiftrq $13, %k5, %k1
4805 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4806 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
4807 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4808 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4809 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4810 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
4811 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4812 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4813 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
4814 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4815 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4816 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4817 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
4818 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4819 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4820 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4821 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
4822 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4823 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4824 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4825 ; AVX512BW-NEXT: kshiftrq $14, %k5, %k1
4826 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4827 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
4828 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4829 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4830 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4831 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
4832 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4833 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4834 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
4835 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4836 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4837 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4838 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
4839 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4840 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4841 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4842 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
4843 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4844 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4845 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4846 ; AVX512BW-NEXT: kshiftrq $15, %k5, %k1
4847 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
4848 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
4849 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4850 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4851 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4852 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
4853 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4854 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4855 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
4856 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4857 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4858 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4859 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
4860 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4861 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4862 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
4863 ; AVX512BW-NEXT: korw %k6, %k0, %k1
4864 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
4865 ; AVX512BW-NEXT: kshiftrq $16, %k5, %k0
4866 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4867 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
4868 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
4869 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
4870 ; AVX512BW-NEXT: korw %k6, %k1, %k1
4871 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4872 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
4873 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
4874 ; AVX512BW-NEXT: korw %k6, %k1, %k1
4875 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4876 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
4877 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
4878 ; AVX512BW-NEXT: korw %k6, %k1, %k1
4879 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
4880 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0
4881 ; AVX512BW-NEXT: korw %k0, %k1, %k0
4882 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4883 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4884 ; AVX512BW-NEXT: kshiftrq $17, %k5, %k1
4885 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4886 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
4887 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4888 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4889 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4890 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
4891 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4892 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4893 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4894 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
4895 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4896 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
4897 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
4898 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
4899 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4900 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4901 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
4902 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4903 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4904 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4905 ; AVX512BW-NEXT: kshiftrq $18, %k5, %k1
4906 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
4907 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
4908 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4909 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4910 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4911 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
4912 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4913 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4914 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4915 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
4916 ; AVX512BW-NEXT: korw %k7, %k0, %k0
4917 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
4918 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
4919 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
4920 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4921 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4922 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
4923 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4924 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4925 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
4926 ; AVX512BW-NEXT: kshiftrq $19, %k5, %k1
4927 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
4928 ; AVX512BW-NEXT: korw %k6, %k0, %k7
4929 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z}
4930 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
4931 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
4932 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
4933 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4934 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4935 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4936 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
4937 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4938 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4939 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4940 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
4941 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4942 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4943 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4944 ; AVX512BW-NEXT: kshiftrq $20, %k5, %k1
4945 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4946 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
4947 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4948 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4949 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4950 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
4951 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4952 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4953 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4954 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
4955 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4956 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4957 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
4958 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4959 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4960 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4961 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
4962 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4963 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4964 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4965 ; AVX512BW-NEXT: kshiftrq $21, %k5, %k1
4966 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
4967 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
4968 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4969 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4970 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
4971 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4972 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
4973 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
4974 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
4975 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4976 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
4977 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
4978 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
4979 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4980 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4981 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
4982 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
4983 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4984 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
4985 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
4986 ; AVX512BW-NEXT: kshiftrq $22, %k5, %k1
4987 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
4988 ; AVX512BW-NEXT: korw %k6, %k0, %k0
4989 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
4990 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
4991 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
4992 ; AVX512BW-NEXT: korw %k6, %k0, %k7
4993 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
4994 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
4995 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
4996 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
4997 ; AVX512BW-NEXT: korw %k1, %k0, %k0
4998 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
4999 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5000 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
5001 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5002 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5003 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5004 ; AVX512BW-NEXT: kshiftrq $23, %k5, %k1
5005 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5006 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5007 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5008 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5009 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5010 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5011 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5012 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5013 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5014 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5015 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5016 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5017 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5018 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5019 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5020 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5021 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5022 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
5023 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5024 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5025 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5026 ; AVX512BW-NEXT: kshiftrq $24, %k5, %k1
5027 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5028 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5029 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5030 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5031 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5032 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5033 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5034 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5035 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5036 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5037 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5038 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5039 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
5040 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5041 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5042 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
5043 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5044 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5045 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5046 ; AVX512BW-NEXT: kshiftrq $25, %k5, %k1
5047 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5048 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5049 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5050 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5051 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5052 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
5053 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5054 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5055 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5056 ; AVX512BW-NEXT: korw %k6, %k0, %k7
5057 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
5058 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5059 ; AVX512BW-NEXT: kandw %k2, %k1, %k0
5060 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
5061 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5062 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5063 ; AVX512BW-NEXT: kshiftrq $26, %k5, %k1
5064 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5065 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
5066 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5067 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5068 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5069 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5070 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5071 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5072 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5073 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5074 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5075 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5076 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5077 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5078 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5079 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5080 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5081 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
5082 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5083 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5084 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5085 ; AVX512BW-NEXT: kshiftrq $27, %k5, %k1
5086 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5087 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5088 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5089 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5090 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5091 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5092 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5093 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5094 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5095 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5096 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5097 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5098 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5099 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5100 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5101 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5102 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5103 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
5104 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5105 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5106 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5107 ; AVX512BW-NEXT: kshiftrq $28, %k5, %k1
5108 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5109 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
5110 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5111 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
5112 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
5113 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5114 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5115 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
5116 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
5117 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
5118 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5119 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5120 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5121 ; AVX512BW-NEXT: korw %k6, %k0, %k6
5122 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z}
5123 ; AVX512BW-NEXT: kandw %k2, %k1, %k0
5124 ; AVX512BW-NEXT: kshiftrq $29, %k5, %k1
5125 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5126 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
5127 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5128 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5129 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5130 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
5131 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5132 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5133 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5134 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5135 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5136 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5137 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5138 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5139 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5140 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5141 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
5142 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5143 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5144 ; AVX512BW-NEXT: kshiftrq $30, %k5, %k1
5145 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5146 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5147 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5148 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5149 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5150 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5151 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5152 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5153 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5154 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5155 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5156 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5157 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5158 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5159 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5160 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5161 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5162 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
5163 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5164 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5165 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5166 ; AVX512BW-NEXT: kshiftrq $31, %k5, %k1
5167 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5168 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
5169 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5170 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5171 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5172 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
5173 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5174 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5175 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5176 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5177 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5178 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5179 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5180 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
5181 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5182 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5183 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5184 ; AVX512BW-NEXT: korw %k6, %k0, %k1
5185 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
5186 ; AVX512BW-NEXT: kshiftrq $32, %k5, %k0
5187 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5188 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
5189 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
5190 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
5191 ; AVX512BW-NEXT: korw %k6, %k1, %k1
5192 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
5193 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
5194 ; AVX512BW-NEXT: korw %k6, %k1, %k1
5195 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5196 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
5197 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
5198 ; AVX512BW-NEXT: korw %k6, %k1, %k1
5199 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5200 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
5201 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0
5202 ; AVX512BW-NEXT: korw %k0, %k1, %k0
5203 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5204 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5205 ; AVX512BW-NEXT: kshiftrq $33, %k5, %k1
5206 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5207 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5208 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5209 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5210 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5211 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5212 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5213 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5214 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5215 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5216 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5217 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5218 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5219 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5220 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5221 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5222 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
5223 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5224 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5225 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5226 ; AVX512BW-NEXT: kshiftrq $34, %k5, %k1
5227 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5228 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
5229 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5230 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5231 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5232 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
5233 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5234 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5235 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5236 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
5237 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5238 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5239 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5240 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
5241 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5242 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5243 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5244 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
5245 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5246 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5247 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5248 ; AVX512BW-NEXT: kshiftrq $35, %k5, %k1
5249 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5250 ; AVX512BW-NEXT: korw %k6, %k0, %k7
5251 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
5252 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
5253 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
5254 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
5255 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5256 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5257 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5258 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
5259 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5260 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5261 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
5262 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5263 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5264 ; AVX512BW-NEXT: kshiftrq $36, %k5, %k1
5265 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5266 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5267 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5268 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5269 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5270 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5271 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5272 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5273 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5274 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5275 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5276 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5277 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5278 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5279 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5280 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5281 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5282 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
5283 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5284 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5285 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5286 ; AVX512BW-NEXT: kshiftrq $37, %k5, %k1
5287 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5288 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5289 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5290 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5291 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5292 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5293 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5294 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5295 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5296 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
5297 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5298 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5299 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5300 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
5301 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5302 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5303 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
5304 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5305 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5306 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5307 ; AVX512BW-NEXT: kshiftrq $38, %k5, %k1
5308 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
5309 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5310 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5311 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5312 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5313 ; AVX512BW-NEXT: korw %k6, %k0, %k7
5314 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z}
5315 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5316 ; AVX512BW-NEXT: kandw %k3, %k1, %k0
5317 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
5318 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5319 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5320 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5321 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
5322 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5323 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5324 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5325 ; AVX512BW-NEXT: kshiftrq $39, %k5, %k1
5326 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5327 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5328 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5329 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5330 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5331 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5332 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5333 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5334 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5335 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5336 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5337 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5338 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5339 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5340 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5341 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
5342 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5343 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5344 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5345 ; AVX512BW-NEXT: kshiftrq $40, %k5, %k1
5346 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5347 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5348 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5349 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5350 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5351 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5352 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5353 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5354 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5355 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5356 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5357 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5358 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5359 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
5360 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5361 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5362 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5363 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
5364 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5365 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5366 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5367 ; AVX512BW-NEXT: kshiftrq $41, %k5, %k1
5368 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5369 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5370 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5371 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
5372 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
5373 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
5374 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5375 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5376 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5377 ; AVX512BW-NEXT: korw %k6, %k0, %k7
5378 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
5379 ; AVX512BW-NEXT: kandw %k3, %k1, %k0
5380 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
5381 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5382 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5383 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5384 ; AVX512BW-NEXT: kshiftrq $42, %k5, %k1
5385 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5386 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
5387 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5388 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5389 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5390 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5391 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5392 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5393 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5394 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5395 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5396 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5397 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5398 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5399 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5400 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5401 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5402 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
5403 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5404 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5405 ; AVX512BW-NEXT: kshiftrq $43, %k5, %k1
5406 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5407 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5408 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5409 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5410 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5411 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5412 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5413 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5414 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5415 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5416 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5417 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5418 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5419 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5420 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5421 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5422 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
5423 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5424 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5425 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5426 ; AVX512BW-NEXT: kshiftrq $44, %k5, %k1
5427 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5428 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
5429 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5430 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5431 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5432 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5433 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5434 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5435 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5436 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
5437 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5438 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5439 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5440 ; AVX512BW-NEXT: korw %k6, %k0, %k6
5441 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z}
5442 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
5443 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
5444 ; AVX512BW-NEXT: kshiftrq $45, %k5, %k1
5445 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5446 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
5447 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5448 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5449 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5450 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
5451 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5452 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5453 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5454 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5455 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5456 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5457 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5458 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5459 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5460 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5461 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5462 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
5463 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5464 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5465 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5466 ; AVX512BW-NEXT: kshiftrq $46, %k5, %k1
5467 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5468 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5469 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5470 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5471 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5472 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5473 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5474 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5475 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5476 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5477 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5478 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5479 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5480 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5481 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5482 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
5483 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5484 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5485 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5486 ; AVX512BW-NEXT: kshiftrq $47, %k5, %k1
5487 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5488 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
5489 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5490 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5491 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5492 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
5493 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5494 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
5495 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
5496 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5497 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5498 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5499 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
5500 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5501 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5502 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5503 ; AVX512BW-NEXT: korw %k6, %k0, %k1
5504 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
5505 ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
5506 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5507 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
5508 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
5509 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
5510 ; AVX512BW-NEXT: korw %k6, %k1, %k1
5511 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5512 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
5513 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
5514 ; AVX512BW-NEXT: korw %k6, %k1, %k1
5515 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5516 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
5517 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
5518 ; AVX512BW-NEXT: korw %k6, %k1, %k1
5519 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5520 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
5521 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0
5522 ; AVX512BW-NEXT: korw %k0, %k1, %k0
5523 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5524 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5525 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1
5526 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5527 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5528 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5529 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5530 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5531 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5532 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5533 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5534 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5535 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5536 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5537 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5538 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5539 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5540 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5541 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5542 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5543 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
5544 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5545 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5546 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5547 ; AVX512BW-NEXT: kshiftrq $50, %k5, %k1
5548 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5549 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
5550 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5551 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5552 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
5553 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5554 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5555 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
5556 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5557 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5558 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5559 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
5560 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5561 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5562 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5563 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
5564 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5565 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5566 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5567 ; AVX512BW-NEXT: kshiftrq $51, %k5, %k1
5568 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5569 ; AVX512BW-NEXT: korw %k6, %k0, %k7
5570 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
5571 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
5572 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
5573 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
5574 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5575 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5576 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
5577 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5578 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5579 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5580 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
5581 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5582 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5583 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5584 ; AVX512BW-NEXT: kshiftrq $52, %k5, %k1
5585 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5586 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5587 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5588 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5589 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5590 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5591 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5592 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5593 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5594 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5595 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5596 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5597 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5598 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5599 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5600 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5601 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5602 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
5603 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5604 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5605 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5606 ; AVX512BW-NEXT: kshiftrq $53, %k5, %k1
5607 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5608 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5609 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5610 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5611 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5612 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5613 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5614 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5615 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5616 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
5617 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5618 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5619 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5620 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
5621 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5622 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5623 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5624 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
5625 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5626 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5627 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5628 ; AVX512BW-NEXT: kshiftrq $54, %k5, %k1
5629 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
5630 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5631 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5632 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5633 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5634 ; AVX512BW-NEXT: korw %k6, %k0, %k7
5635 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
5636 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
5637 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
5638 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
5639 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5640 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5641 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5642 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
5643 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5644 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5645 ; AVX512BW-NEXT: kshiftrq $55, %k5, %k1
5646 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5647 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5648 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5649 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5650 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5651 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5652 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5653 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5654 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5655 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5656 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5657 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5658 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5659 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5660 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5661 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5662 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5663 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
5664 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5665 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5666 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5667 ; AVX512BW-NEXT: kshiftrq $56, %k5, %k1
5668 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5669 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5670 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5671 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5672 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5673 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5674 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5675 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5676 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5677 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5678 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5679 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5680 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
5681 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5682 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5683 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5684 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
5685 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5686 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5687 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1
5688 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5689 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5690 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5691 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5692 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5693 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
5694 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5695 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5696 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5697 ; AVX512BW-NEXT: korw %k6, %k0, %k7
5698 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
5699 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
5700 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
5701 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
5702 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5703 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5704 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5705 ; AVX512BW-NEXT: kshiftrq $58, %k5, %k1
5706 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5707 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
5708 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5709 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5710 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5711 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5712 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5713 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5714 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5715 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5716 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
5717 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5718 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
5719 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5720 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5721 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5722 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
5723 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5724 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5725 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5726 ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1
5727 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5728 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5729 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5730 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5731 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5732 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5733 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5734 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5735 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5736 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5737 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5738 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5739 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5740 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
5741 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5742 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
5743 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
5744 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
5745 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5746 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5747 ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1
5748 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
5749 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
5750 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5751 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
5752 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5753 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
5754 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5755 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5756 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
5757 ; AVX512BW-NEXT: korw %k7, %k0, %k0
5758 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5759 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5760 ; AVX512BW-NEXT: korw %k6, %k0, %k6
5761 ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k6} {z}
5762 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
5763 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
5764 ; AVX512BW-NEXT: kshiftrq $61, %k5, %k1
5765 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5766 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
5767 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5768 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5769 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5770 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
5771 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5772 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5773 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5774 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
5775 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5776 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5777 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5778 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
5779 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5780 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
5781 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
5782 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5783 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5784 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5785 ; AVX512BW-NEXT: kshiftrq $62, %k5, %k1
5786 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
5787 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
5788 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5789 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5790 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5791 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
5792 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5793 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5794 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5795 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
5796 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5797 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5798 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5799 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
5800 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5801 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5802 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5803 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5
5804 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
5805 ; AVX512BW-NEXT: korw %k1, %k0, %k0
5806 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
5807 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
5808 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k1
5809 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
5810 ; AVX512BW-NEXT: korw %k6, %k0, %k0
5811 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5812 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5813 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k4
5814 ; AVX512BW-NEXT: korw %k4, %k0, %k0
5815 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
5816 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k3
5817 ; AVX512BW-NEXT: korw %k3, %k0, %k0
5818 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
5819 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
5820 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k2
5821 ; AVX512BW-NEXT: korw %k2, %k0, %k0
5822 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
5823 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
5824 ; AVX512BW-NEXT: korw %k1, %k0, %k1
5825 ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
5826 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
5827 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx)
5828 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx)
5829 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
5830 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx)
5831 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx)
5832 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx)
5833 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx)
5834 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
5835 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx)
5836 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
5837 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx)
5838 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
5839 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
5840 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
5841 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
5842 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
5843 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
5844 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
5845 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
5846 ; AVX512BW-NEXT: vzeroupper
5847 ; AVX512BW-NEXT: retq
5848 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
5849 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63>
5850 %data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison)
5851 store <320 x i32> %data, ptr %out.vec, align 64
5852 ret void
5853 }
5855 define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
5856 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2:
5857 ; AVX512F-ONLY: # %bb.0:
5858 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
5859 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
5860 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
5861 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
5862 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
5863 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
5864 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
5865 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
5866 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
5867 ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
5868 ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5869 ; AVX512F-ONLY-NEXT: vzeroupper
5870 ; AVX512F-ONLY-NEXT: retq
5872 ; AVX512DQ-LABEL: mask_replication_factor6_vf2:
5873 ; AVX512DQ: # %bb.0:
5874 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
5875 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
5876 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
5877 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
5878 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
5879 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
5880 ; AVX512DQ-NEXT: kmovw %eax, %k1
5881 ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
5882 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
5883 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
5884 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
5885 ; AVX512DQ-NEXT: vzeroupper
5886 ; AVX512DQ-NEXT: retq
5888 ; AVX512BW-LABEL: mask_replication_factor6_vf2:
5889 ; AVX512BW: # %bb.0:
5890 ; AVX512BW-NEXT: kmovw (%rdi), %k1
5891 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
5892 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0]
5893 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
5894 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
5895 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF
5896 ; AVX512BW-NEXT: kmovd %eax, %k1
5897 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
5898 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
5899 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
5900 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
5901 ; AVX512BW-NEXT: vzeroupper
5902 ; AVX512BW-NEXT: retq
5903 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
5904 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
5905 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
5906 %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
5907 %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
5908 store <12 x i32> %data, ptr %out.vec, align 64
5909 ret void
5910 }
5912 define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
5913 ; AVX512F-SLOW-LABEL: mask_replication_factor6_vf4:
5914 ; AVX512F-SLOW: # %bb.0:
5915 ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
5916 ; AVX512F-SLOW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
5917 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
5918 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
5919 ; AVX512F-SLOW-NEXT: vpslld $31, %zmm1, %zmm1
5920 ; AVX512F-SLOW-NEXT: movw $255, %ax
5921 ; AVX512F-SLOW-NEXT: kmovw %eax, %k1
5922 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
5923 ; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
5924 ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
5925 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k2
5926 ; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
5927 ; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
5928 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
5929 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
5930 ; AVX512F-SLOW-NEXT: vzeroupper
5931 ; AVX512F-SLOW-NEXT: retq
5933 ; AVX512F-FAST-LABEL: mask_replication_factor6_vf4:
5934 ; AVX512F-FAST: # %bb.0:
5935 ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
5936 ; AVX512F-FAST-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
5937 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
5938 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
5939 ; AVX512F-FAST-NEXT: vpslld $31, %zmm1, %zmm1
5940 ; AVX512F-FAST-NEXT: movw $255, %ax
5941 ; AVX512F-FAST-NEXT: kmovw %eax, %k1
5942 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
5943 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
5944 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
5945 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k2
5946 ; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
5947 ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
5948 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
5949 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
5950 ; AVX512F-FAST-NEXT: vzeroupper
5951 ; AVX512F-FAST-NEXT: retq
5953 ; AVX512DQ-SLOW-LABEL: mask_replication_factor6_vf4:
5954 ; AVX512DQ-SLOW: # %bb.0:
5955 ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
5956 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
5957 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
5958 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
5959 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
5960 ; AVX512DQ-SLOW-NEXT: movw $255, %ax
5961 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
5962 ; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
5963 ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
5964 ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
5965 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2
5966 ; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
5967 ; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
5968 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
5969 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
5970 ; AVX512DQ-SLOW-NEXT: vzeroupper
5971 ; AVX512DQ-SLOW-NEXT: retq
5973 ; AVX512DQ-FAST-LABEL: mask_replication_factor6_vf4:
5974 ; AVX512DQ-FAST: # %bb.0:
5975 ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
5976 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
5977 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
5978 ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
5979 ; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
5980 ; AVX512DQ-FAST-NEXT: movw $255, %ax
5981 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
5982 ; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
5983 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
5984 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
5985 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
5986 ; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
5987 ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
5988 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
5989 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
5990 ; AVX512DQ-FAST-NEXT: vzeroupper
5991 ; AVX512DQ-FAST-NEXT: retq
5993 ; AVX512BW-LABEL: mask_replication_factor6_vf4:
5994 ; AVX512BW: # %bb.0:
5995 ; AVX512BW-NEXT: kmovd (%rdi), %k0
5996 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
5997 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,0,0,0,0,0,0,0,0]
5998 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
5999 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
6000 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF
6001 ; AVX512BW-NEXT: kmovd %eax, %k1
6002 ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
6003 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
6004 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
6005 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
6006 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
6007 ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx)
6008 ; AVX512BW-NEXT: vzeroupper
6009 ; AVX512BW-NEXT: retq
6010 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
6011 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6012 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
6013 %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
6014 %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
6015 store <24 x i32> %data, ptr %out.vec, align 64
6016 ret void
6017 }
6019 define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
6020 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8:
6021 ; AVX512F-ONLY: # %bb.0:
6022 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
6023 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
6024 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6025 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
6026 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
6027 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
6028 ; AVX512F-ONLY-NEXT: movw $1, %ax
6029 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
6030 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
6031 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
6032 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6033 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
6034 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
6035 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6036 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
6037 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
6038 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
6039 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
6040 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
6041 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
6042 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx)
6043 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
6044 ; AVX512F-ONLY-NEXT: vzeroupper
6045 ; AVX512F-ONLY-NEXT: retq
6047 ; AVX512DQ-LABEL: mask_replication_factor6_vf8:
6048 ; AVX512DQ: # %bb.0:
6049 ; AVX512DQ-NEXT: kmovb (%rdi), %k0
6050 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
6051 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6052 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
6053 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
6054 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
6055 ; AVX512DQ-NEXT: movw $1, %ax
6056 ; AVX512DQ-NEXT: kmovw %eax, %k1
6057 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
6058 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
6059 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6060 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
6061 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
6062 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6063 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
6064 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
6065 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
6066 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
6067 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
6068 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
6069 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
6070 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
6071 ; AVX512DQ-NEXT: vzeroupper
6072 ; AVX512DQ-NEXT: retq
6074 ; AVX512BW-LABEL: mask_replication_factor6_vf8:
6075 ; AVX512BW: # %bb.0:
6076 ; AVX512BW-NEXT: kmovw (%rdi), %k1
6077 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
6078 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6079 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
6080 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
6081 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
6082 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6083 ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
6084 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
6085 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
6086 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6087 ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0
6088 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
6089 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
6090 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
6091 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
6092 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
6093 ; AVX512BW-NEXT: vzeroupper
6094 ; AVX512BW-NEXT: retq
6095 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
6096 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
6097 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
6098 %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
6099 store <48 x i32> %data, ptr %out.vec, align 64
6100 ret void
6101 }
6103 define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
6104 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16:
6105 ; AVX512F-ONLY: # %bb.0:
6106 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
6107 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
6108 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6109 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
6110 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
6111 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
6112 ; AVX512F-ONLY-NEXT: movw $1, %ax
6113 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
6114 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
6115 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
6116 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6117 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
6118 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
6119 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6120 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
6121 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
6122 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
6123 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
6124 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
6125 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
6126 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
6127 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
6128 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
6129 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
6130 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
6131 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
6132 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
6133 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
6134 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
6135 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
6136 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
6137 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
6138 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx)
6139 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
6140 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
6141 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
6142 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
6143 ; AVX512F-ONLY-NEXT: vzeroupper
6144 ; AVX512F-ONLY-NEXT: retq
6146 ; AVX512DQ-LABEL: mask_replication_factor6_vf16:
6147 ; AVX512DQ: # %bb.0:
6148 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
6149 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
6150 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6151 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
6152 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
6153 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
6154 ; AVX512DQ-NEXT: movw $1, %ax
6155 ; AVX512DQ-NEXT: kmovw %eax, %k1
6156 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
6157 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
6158 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6159 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
6160 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
6161 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6162 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
6163 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
6164 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
6165 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
6166 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
6167 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
6168 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
6169 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
6170 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
6171 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
6172 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
6173 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
6174 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
6175 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
6176 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
6177 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
6178 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
6179 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
6180 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx)
6181 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
6182 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
6183 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
6184 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
6185 ; AVX512DQ-NEXT: vzeroupper
6186 ; AVX512DQ-NEXT: retq
6188 ; AVX512BW-LABEL: mask_replication_factor6_vf16:
6189 ; AVX512BW: # %bb.0:
6190 ; AVX512BW-NEXT: kmovw (%rdi), %k1
6191 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
6192 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6193 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
6194 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
6195 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
6196 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
6197 ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
6198 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
6199 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z}
6200 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
6201 ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3
6202 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
6203 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
6204 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
6205 ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4
6206 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1
6207 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z}
6208 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6209 ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5
6210 ; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1
6211 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
6212 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6213 ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0
6214 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
6215 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
6216 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
6217 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx)
6218 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
6219 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rdx)
6220 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rdx)
6221 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
6222 ; AVX512BW-NEXT: vzeroupper
6223 ; AVX512BW-NEXT: retq
6224 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
6225 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
6226 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
6227 %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
6228 store <96 x i32> %data, ptr %out.vec, align 64
6229 ret void
6230 }
6232 define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
6233 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32:
6234 ; AVX512F-ONLY: # %bb.0:
6235 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
6236 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
6237 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6238 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
6239 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
6240 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
6241 ; AVX512F-ONLY-NEXT: movw $1, %ax
6242 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
6243 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
6244 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
6245 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
6246 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
6247 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6248 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4
6249 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6250 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
6251 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
6252 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
6253 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
6254 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10
6255 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
6256 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0
6257 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1
6258 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2
6259 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5
6260 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7
6261 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9
6262 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm3
6263 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
6264 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
6265 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
6266 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
6267 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
6268 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
6269 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
6270 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
6271 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
6272 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
6273 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
6274 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
6275 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
6276 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
6277 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
6278 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
6279 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
6280 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
6281 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
6282 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
6283 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
6284 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
6285 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
6286 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
6287 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
6288 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
6289 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx)
6290 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx)
6291 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx)
6292 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx)
6293 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx)
6294 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 576(%rdx)
6295 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 640(%rdx)
6296 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 704(%rdx)
6297 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx)
6298 ; AVX512F-ONLY-NEXT: vzeroupper
6299 ; AVX512F-ONLY-NEXT: retq
6301 ; AVX512DQ-LABEL: mask_replication_factor6_vf32:
6302 ; AVX512DQ: # %bb.0:
6303 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
6304 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
6305 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
6306 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
6307 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
6308 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
6309 ; AVX512DQ-NEXT: movw $1, %ax
6310 ; AVX512DQ-NEXT: kmovw %eax, %k1
6311 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
6312 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
6313 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
6314 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
6315 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
6316 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4
6317 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
6318 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
6319 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
6320 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
6321 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
6322 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10
6323 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
6324 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0
6325 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1
6326 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2
6327 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5
6328 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7
6329 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9
6330 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm3
6331 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z}
6332 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
6333 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
6334 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
6335 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z}
6336 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
6337 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z}
6338 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
6339 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z}
6340 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
6341 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
6342 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
6343 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z}
6344 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
6345 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z}
6346 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
6347 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
6348 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
6349 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
6350 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
6351 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
6352 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
6353 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
6354 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
6355 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx)
6356 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
6357 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx)
6358 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx)
6359 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx)
6360 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx)
6361 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx)
6362 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%rdx)
6363 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rdx)
6364 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 704(%rdx)
6365 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx)
6366 ; AVX512DQ-NEXT: vzeroupper
6367 ; AVX512DQ-NEXT: retq
6369 ; AVX512BW-LABEL: mask_replication_factor6_vf32:
6370 ; AVX512BW: # %bb.0:
6371 ; AVX512BW-NEXT: kmovd (%rdi), %k5
6372 ; AVX512BW-NEXT: movw $-3, %ax
6373 ; AVX512BW-NEXT: kmovd %eax, %k0
6374 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6375 ; AVX512BW-NEXT: kmovw (%rdi), %k1
6376 ; AVX512BW-NEXT: kandw %k0, %k1, %k2
6377 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
6378 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k3
6379 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6380 ; AVX512BW-NEXT: movw $-5, %ax
6381 ; AVX512BW-NEXT: kmovd %eax, %k0
6382 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6383 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6384 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
6385 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6386 ; AVX512BW-NEXT: movw $-9, %ax
6387 ; AVX512BW-NEXT: kmovd %eax, %k0
6388 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6389 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6390 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
6391 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6392 ; AVX512BW-NEXT: movw $-17, %ax
6393 ; AVX512BW-NEXT: kmovd %eax, %k7
6394 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
6395 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6396 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
6397 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6398 ; AVX512BW-NEXT: movw $-33, %ax
6399 ; AVX512BW-NEXT: kmovd %eax, %k0
6400 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6401 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6402 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
6403 ; AVX512BW-NEXT: korw %k1, %k2, %k1
6404 ; AVX512BW-NEXT: movw $-65, %ax
6405 ; AVX512BW-NEXT: kmovd %eax, %k0
6406 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6407 ; AVX512BW-NEXT: kandw %k0, %k1, %k2
6408 ; AVX512BW-NEXT: kshiftrd $1, %k5, %k1
6409 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
6410 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
6411 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6412 ; AVX512BW-NEXT: movw $-129, %ax
6413 ; AVX512BW-NEXT: kmovd %eax, %k0
6414 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6415 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6416 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
6417 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6418 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
6419 ; AVX512BW-NEXT: kmovd %eax, %k0
6420 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6421 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6422 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
6423 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6424 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
6425 ; AVX512BW-NEXT: kmovd %eax, %k0
6426 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6427 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6428 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
6429 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6430 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
6431 ; AVX512BW-NEXT: kmovd %eax, %k6
6432 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
6433 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6434 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k3
6435 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6436 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
6437 ; AVX512BW-NEXT: kmovd %eax, %k0
6438 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6439 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6440 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
6441 ; AVX512BW-NEXT: korw %k1, %k2, %k1
6442 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
6443 ; AVX512BW-NEXT: kmovd %eax, %k0
6444 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6445 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
6446 ; AVX512BW-NEXT: kshiftrd $2, %k5, %k4
6447 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k3
6448 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6449 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2
6450 ; AVX512BW-NEXT: korw %k2, %k1, %k1
6451 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
6452 ; AVX512BW-NEXT: kmovd %eax, %k2
6453 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6454 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
6455 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k2
6456 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6457 ; AVX512BW-NEXT: korw %k2, %k1, %k1
6458 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
6459 ; AVX512BW-NEXT: kmovd %eax, %k2
6460 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6461 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
6462 ; AVX512BW-NEXT: kshiftlw $14, %k4, %k2
6463 ; AVX512BW-NEXT: korw %k2, %k1, %k1
6464 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
6465 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
6466 ; AVX512BW-NEXT: korw %k3, %k1, %k1
6467 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
6468 ; AVX512BW-NEXT: kshiftrd $29, %k5, %k0
6469 ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6470 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6471 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
6472 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
6473 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k4
6474 ; AVX512BW-NEXT: korw %k4, %k1, %k1
6475 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
6476 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
6477 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k4
6478 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6479 ; AVX512BW-NEXT: korw %k4, %k1, %k1
6480 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6481 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
6482 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k4
6483 ; AVX512BW-NEXT: korw %k4, %k1, %k1
6484 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
6485 ; AVX512BW-NEXT: kshiftrd $30, %k5, %k4
6486 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
6487 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k7
6488 ; AVX512BW-NEXT: korw %k7, %k1, %k1
6489 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6490 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
6491 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k7
6492 ; AVX512BW-NEXT: korw %k7, %k1, %k1
6493 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6494 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
6495 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k7
6496 ; AVX512BW-NEXT: korw %k7, %k1, %k1
6497 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6498 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
6499 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k7
6500 ; AVX512BW-NEXT: korw %k7, %k1, %k1
6501 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6502 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
6503 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k7
6504 ; AVX512BW-NEXT: korw %k7, %k1, %k1
6505 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6506 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
6507 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
6508 ; AVX512BW-NEXT: korw %k4, %k1, %k1
6509 ; AVX512BW-NEXT: kandw %k6, %k1, %k4
6510 ; AVX512BW-NEXT: kshiftrd $31, %k5, %k7
6511 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k1
6512 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
6513 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6514 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6515 ; AVX512BW-NEXT: kandw %k0, %k4, %k4
6516 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
6517 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6518 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6519 ; AVX512BW-NEXT: kandw %k0, %k4, %k4
6520 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
6521 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6522 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6523 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
6524 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
6525 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6526 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6527 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
6528 ; AVX512BW-NEXT: kshiftlw $14, %k7, %k6
6529 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6530 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
6531 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
6532 ; AVX512BW-NEXT: korw %k1, %k4, %k1
6533 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
6534 ; AVX512BW-NEXT: kshiftrd $26, %k5, %k4
6535 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6536 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6537 ; AVX512BW-NEXT: kandw %k1, %k4, %k6
6538 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k1
6539 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6540 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k7
6541 ; AVX512BW-NEXT: korw %k7, %k6, %k6
6542 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
6543 ; AVX512BW-NEXT: kshiftrd $27, %k5, %k7
6544 ; AVX512BW-NEXT: kmovq %k5, %k2
6545 ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6546 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
6547 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k5
6548 ; AVX512BW-NEXT: korw %k5, %k6, %k5
6549 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6550 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
6551 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
6552 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6553 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6554 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6555 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
6556 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6557 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6558 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6559 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
6560 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6561 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6562 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6563 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
6564 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6565 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6566 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6567 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6
6568 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6569 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6570 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6571 ; AVX512BW-NEXT: kshiftrd $28, %k2, %k6
6572 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
6573 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
6574 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6575 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
6576 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
6577 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6578 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6579 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6580 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
6581 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6582 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6583 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6584 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
6585 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6586 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
6587 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
6588 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6589 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6590 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6591 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
6592 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6593 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
6594 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
6595 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
6596 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k3
6597 ; AVX512BW-NEXT: korw %k3, %k5, %k3
6598 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
6599 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
6600 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
6601 ; AVX512BW-NEXT: korw %k2, %k3, %k2
6602 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z}
6603 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
6604 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k2
6605 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6606 ; AVX512BW-NEXT: kandw %k3, %k2, %k3
6607 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
6608 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k5
6609 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6610 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
6611 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
6612 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k5
6613 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6614 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
6615 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k5
6616 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6617 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6618 ; AVX512BW-NEXT: kandw %k6, %k3, %k3
6619 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
6620 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6621 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6622 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
6623 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
6624 ; AVX512BW-NEXT: korw %k2, %k3, %k2
6625 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6626 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
6627 ; AVX512BW-NEXT: kshiftrd $25, %k0, %k3
6628 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
6629 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
6630 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6631 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6632 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6633 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
6634 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6635 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6636 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6637 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k5
6638 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6639 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6640 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6641 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k5
6642 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6643 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6644 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6645 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k5
6646 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6647 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6648 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6649 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
6650 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6651 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6652 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
6653 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6654 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k3
6655 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6656 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
6657 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k3
6658 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6659 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
6660 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
6661 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
6662 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6663 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
6664 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
6665 ; AVX512BW-NEXT: korw %k4, %k2, %k1
6666 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
6667 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
6668 ; AVX512BW-NEXT: kshiftrd $21, %k7, %k2
6669 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6670 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6671 ; AVX512BW-NEXT: kandw %k0, %k2, %k3
6672 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
6673 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6674 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
6675 ; AVX512BW-NEXT: korw %k4, %k3, %k3
6676 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6677 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6678 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
6679 ; AVX512BW-NEXT: korw %k4, %k3, %k3
6680 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6681 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6682 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
6683 ; AVX512BW-NEXT: korw %k4, %k3, %k3
6684 ; AVX512BW-NEXT: kandw %k6, %k3, %k3
6685 ; AVX512BW-NEXT: kshiftrd $22, %k7, %k4
6686 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
6687 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
6688 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6689 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6690 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6691 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
6692 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6693 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6694 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6695 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
6696 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6697 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6698 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
6699 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
6700 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6701 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
6702 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
6703 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
6704 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6705 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
6706 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
6707 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
6708 ; AVX512BW-NEXT: korw %k4, %k3, %k3
6709 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6710 ; AVX512BW-NEXT: kandw %k4, %k3, %k4
6711 ; AVX512BW-NEXT: kshiftrd $23, %k7, %k5
6712 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k3
6713 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
6714 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6715 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6716 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
6717 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
6718 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6719 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6720 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
6721 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
6722 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6723 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6724 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
6725 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
6726 ; AVX512BW-NEXT: korw %k6, %k4, %k4
6727 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6728 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
6729 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
6730 ; AVX512BW-NEXT: korw %k5, %k4, %k4
6731 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
6732 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
6733 ; AVX512BW-NEXT: korw %k3, %k4, %k3
6734 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z}
6735 ; AVX512BW-NEXT: kmovq %k7, %k4
6736 ; AVX512BW-NEXT: kshiftrd $18, %k7, %k6
6737 ; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6738 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6739 ; AVX512BW-NEXT: kandw %k3, %k6, %k5
6740 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k3
6741 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6742 ; AVX512BW-NEXT: kshiftrw $14, %k3, %k6
6743 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6744 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6745 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
6746 ; AVX512BW-NEXT: kshiftrd $19, %k7, %k6
6747 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
6748 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
6749 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6750 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6751 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
6752 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
6753 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6754 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6755 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
6756 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
6757 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6758 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6759 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
6760 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
6761 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6762 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
6763 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
6764 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6765 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6766 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
6767 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6768 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
6769 ; AVX512BW-NEXT: kshiftrd $20, %k4, %k6
6770 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
6771 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
6772 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6773 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6774 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
6775 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
6776 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6777 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6778 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
6779 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
6780 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6781 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6782 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
6783 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
6784 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6785 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6786 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6787 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
6788 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6789 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6790 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6791 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
6792 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6793 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6794 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6795 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
6796 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
6797 ; AVX512BW-NEXT: korw %k2, %k5, %k2
6798 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
6799 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
6800 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6801 ; AVX512BW-NEXT: korw %k0, %k2, %k1
6802 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
6803 ; AVX512BW-NEXT: kmovq %k4, %k0
6804 ; AVX512BW-NEXT: kshiftrd $16, %k4, %k1
6805 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
6806 ; AVX512BW-NEXT: kandw %k2, %k1, %k2
6807 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
6808 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
6809 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6810 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
6811 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
6812 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
6813 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6814 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
6815 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
6816 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
6817 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6818 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
6819 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
6820 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
6821 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6822 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6823 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6824 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
6825 ; AVX512BW-NEXT: korw %k1, %k2, %k1
6826 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6827 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
6828 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2
6829 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
6830 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
6831 ; AVX512BW-NEXT: korw %k5, %k1, %k1
6832 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
6833 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
6834 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
6835 ; AVX512BW-NEXT: korw %k5, %k1, %k1
6836 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
6837 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
6838 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k5
6839 ; AVX512BW-NEXT: korw %k5, %k1, %k1
6840 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
6841 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
6842 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k5
6843 ; AVX512BW-NEXT: korw %k5, %k1, %k1
6844 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6845 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
6846 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k5
6847 ; AVX512BW-NEXT: korw %k5, %k1, %k1
6848 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
6849 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
6850 ; AVX512BW-NEXT: korw %k2, %k1, %k1
6851 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6852 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
6853 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6854 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k2
6855 ; AVX512BW-NEXT: korw %k2, %k1, %k1
6856 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6857 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
6858 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k2
6859 ; AVX512BW-NEXT: korw %k2, %k1, %k1
6860 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6861 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
6862 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
6863 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k2
6864 ; AVX512BW-NEXT: korw %k2, %k1, %k1
6865 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
6866 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
6867 ; AVX512BW-NEXT: korw %k3, %k1, %k1
6868 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
6869 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
6870 ; AVX512BW-NEXT: kshiftrd $13, %k0, %k3
6871 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6872 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6873 ; AVX512BW-NEXT: kandw %k1, %k3, %k2
6874 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k5
6875 ; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6876 ; AVX512BW-NEXT: kshiftrw $14, %k5, %k3
6877 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6878 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
6879 ; AVX512BW-NEXT: kshiftrw $13, %k5, %k3
6880 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6881 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
6882 ; AVX512BW-NEXT: kshiftrw $12, %k5, %k3
6883 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6884 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6885 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
6886 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k3
6887 ; AVX512BW-NEXT: kmovq %k0, %k7
6888 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
6889 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
6890 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6891 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6892 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6893 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
6894 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6895 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6896 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
6897 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6898 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6899 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6900 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
6901 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6902 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
6903 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
6904 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k5
6905 ; AVX512BW-NEXT: korw %k5, %k2, %k2
6906 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6907 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
6908 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
6909 ; AVX512BW-NEXT: korw %k3, %k2, %k2
6910 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6911 ; AVX512BW-NEXT: kandw %k0, %k2, %k3
6912 ; AVX512BW-NEXT: kshiftrd $15, %k7, %k5
6913 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k2
6914 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
6915 ; AVX512BW-NEXT: korw %k6, %k3, %k3
6916 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6917 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6918 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
6919 ; AVX512BW-NEXT: korw %k6, %k3, %k3
6920 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6921 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6922 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
6923 ; AVX512BW-NEXT: korw %k6, %k3, %k3
6924 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6925 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6926 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
6927 ; AVX512BW-NEXT: korw %k6, %k3, %k3
6928 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6929 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
6930 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
6931 ; AVX512BW-NEXT: korw %k5, %k3, %k3
6932 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
6933 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
6934 ; AVX512BW-NEXT: korw %k2, %k3, %k2
6935 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
6936 ; AVX512BW-NEXT: kmovq %k7, %k2
6937 ; AVX512BW-NEXT: kshiftrd $10, %k7, %k0
6938 ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6939 ; AVX512BW-NEXT: kandw %k1, %k0, %k5
6940 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
6941 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
6942 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
6943 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6944 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6945 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
6946 ; AVX512BW-NEXT: kshiftrd $11, %k7, %k6
6947 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
6948 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
6949 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6950 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6951 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
6952 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
6953 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6954 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
6955 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
6956 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
6957 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6958 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6959 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6960 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
6961 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6962 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6963 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6964 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
6965 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6966 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6967 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6968 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
6969 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6970 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
6971 ; AVX512BW-NEXT: kshiftrd $12, %k2, %k6
6972 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
6973 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
6974 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6975 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6976 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6977 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
6978 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6979 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6980 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6981 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
6982 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6983 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6984 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6985 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
6986 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6987 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
6988 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
6989 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
6990 ; AVX512BW-NEXT: korw %k7, %k5, %k5
6991 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
6992 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
6993 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
6994 ; AVX512BW-NEXT: korw %k6, %k5, %k5
6995 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
6996 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
6997 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
6998 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k4
6999 ; AVX512BW-NEXT: korw %k4, %k5, %k4
7000 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
7001 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
7002 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7003 ; AVX512BW-NEXT: korw %k1, %k4, %k1
7004 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
7005 ; AVX512BW-NEXT: kshiftrd $8, %k2, %k1
7006 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7007 ; AVX512BW-NEXT: kandw %k6, %k1, %k4
7008 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7009 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
7010 ; AVX512BW-NEXT: korw %k5, %k4, %k4
7011 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7012 ; AVX512BW-NEXT: kandw %k5, %k4, %k4
7013 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
7014 ; AVX512BW-NEXT: korw %k5, %k4, %k4
7015 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7016 ; AVX512BW-NEXT: kandw %k5, %k4, %k4
7017 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
7018 ; AVX512BW-NEXT: korw %k5, %k4, %k4
7019 ; AVX512BW-NEXT: kandw %k0, %k4, %k4
7020 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
7021 ; AVX512BW-NEXT: korw %k5, %k4, %k4
7022 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
7023 ; AVX512BW-NEXT: kandw %k0, %k4, %k4
7024 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
7025 ; AVX512BW-NEXT: korw %k1, %k4, %k1
7026 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
7027 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
7028 ; AVX512BW-NEXT: kshiftrd $9, %k2, %k4
7029 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
7030 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
7031 ; AVX512BW-NEXT: korw %k5, %k1, %k1
7032 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
7033 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
7034 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
7035 ; AVX512BW-NEXT: korw %k5, %k1, %k1
7036 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
7037 ; AVX512BW-NEXT: kandw %k0, %k1, %k1
7038 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
7039 ; AVX512BW-NEXT: korw %k5, %k1, %k1
7040 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7041 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
7042 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k5
7043 ; AVX512BW-NEXT: korw %k5, %k1, %k1
7044 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7045 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
7046 ; AVX512BW-NEXT: kshiftrw $5, %k4, %k5
7047 ; AVX512BW-NEXT: korw %k5, %k1, %k1
7048 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7049 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
7050 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k4
7051 ; AVX512BW-NEXT: korw %k4, %k1, %k1
7052 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7053 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
7054 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7055 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k4
7056 ; AVX512BW-NEXT: korw %k4, %k1, %k1
7057 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
7058 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k4
7059 ; AVX512BW-NEXT: korw %k4, %k1, %k1
7060 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
7061 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
7062 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
7063 ; AVX512BW-NEXT: korw %k2, %k1, %k1
7064 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
7065 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
7066 ; AVX512BW-NEXT: korw %k5, %k1, %k1
7067 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
7068 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
7069 ; AVX512BW-NEXT: kshiftrd $5, %k1, %k2
7070 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
7071 ; AVX512BW-NEXT: kandw %k6, %k2, %k3
7072 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7
7073 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k4
7074 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7075 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7076 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7077 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k4
7078 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7079 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7080 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7081 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k4
7082 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7083 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7084 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7085 ; AVX512BW-NEXT: kshiftrd $6, %k1, %k4
7086 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
7087 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
7088 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7089 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7090 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7091 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
7092 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7093 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7094 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7095 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
7096 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7097 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7098 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7099 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
7100 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7101 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
7102 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
7103 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7104 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7105 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
7106 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
7107 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7108 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7109 ; AVX512BW-NEXT: kandw %k4, %k3, %k4
7110 ; AVX512BW-NEXT: kshiftrd $7, %k1, %k5
7111 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k3
7112 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
7113 ; AVX512BW-NEXT: korw %k6, %k4, %k4
7114 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7115 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
7116 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
7117 ; AVX512BW-NEXT: korw %k6, %k4, %k4
7118 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7119 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
7120 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
7121 ; AVX512BW-NEXT: korw %k6, %k4, %k4
7122 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7123 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
7124 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
7125 ; AVX512BW-NEXT: korw %k6, %k4, %k4
7126 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7127 ; AVX512BW-NEXT: kandw %k6, %k4, %k4
7128 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
7129 ; AVX512BW-NEXT: korw %k5, %k4, %k4
7130 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
7131 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
7132 ; AVX512BW-NEXT: korw %k3, %k4, %k3
7133 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z}
7134 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
7135 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7136 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
7137 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7138 ; AVX512BW-NEXT: kshiftrw $14, %k4, %k4
7139 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7140 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7141 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
7142 ; AVX512BW-NEXT: kshiftrd $3, %k1, %k4
7143 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
7144 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k5
7145 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7146 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7147 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
7148 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k5
7149 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7150 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7151 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
7152 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
7153 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7154 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7155 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
7156 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
7157 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7158 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7159 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
7160 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
7161 ; AVX512BW-NEXT: korw %k5, %k3, %k3
7162 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7163 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k4
7164 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7165 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
7166 ; AVX512BW-NEXT: kshiftrd $4, %k1, %k0
7167 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
7168 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k4
7169 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7170 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7171 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
7172 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k4
7173 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7174 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7175 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
7176 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k4
7177 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7178 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7179 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
7180 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k4
7181 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7182 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7183 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
7184 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
7185 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7186 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7187 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
7188 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
7189 ; AVX512BW-NEXT: korw %k0, %k3, %k0
7190 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7191 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
7192 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
7193 ; AVX512BW-NEXT: korw %k2, %k0, %k0
7194 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
7195 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
7196 ; AVX512BW-NEXT: korw %k7, %k0, %k1
7197 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
7198 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
7199 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx)
7200 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx)
7201 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx)
7202 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx)
7203 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
7204 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx)
7205 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx)
7206 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx)
7207 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx)
7208 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx)
7209 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
7210 ; AVX512BW-NEXT: vzeroupper
7211 ; AVX512BW-NEXT: retq
7212 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
7213 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
7214 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
7215 %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
7216 store <192 x i32> %data, ptr %out.vec, align 64
7217 ret void
7218 }
7220 define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
7221 ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64:
7222 ; AVX512F-ONLY: # %bb.0:
7223 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
7224 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
7225 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
7226 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1
7227 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
7228 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
7229 ; AVX512F-ONLY-NEXT: movw $1, %ax
7230 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
7231 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7232 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
7233 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm7 {%k1} {z} = -1
7234 ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
7235 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1
7236 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
7237 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm9 {%k1} {z} = -1
7238 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
7239 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
7240 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1
7241 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
7242 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2
7243 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
7244 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3
7245 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
7246 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5
7247 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
7248 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6
7249 ; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7
7250 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15
7251 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm16
7252 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm17
7253 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm18
7254 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm19
7255 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm4, %zmm8
7256 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm10, %zmm20
7257 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm21
7258 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm12, %zmm22
7259 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm23
7260 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm4, %zmm24
7261 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm14, %zmm9
7262 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm10, %zmm10
7263 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11
7264 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm12, %zmm12
7265 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13
7266 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm14, %zmm4
7267 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
7268 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
7269 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
7270 ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
7271 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
7272 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
7273 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
7274 ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
7275 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z}
7276 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
7277 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z}
7278 ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1
7279 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
7280 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
7281 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z}
7282 ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1
7283 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
7284 ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1
7285 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
7286 ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1
7287 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
7288 ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1
7289 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
7290 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
7291 ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
7292 ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1
7293 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
7294 ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1
7295 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
7296 ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1
7297 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
7298 ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1
7299 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
7300 ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
7301 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
7302 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
7303 ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z}
7304 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
7305 ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
7306 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
7307 ; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z}
7308 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
7309 ; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
7310 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
7311 ; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z}
7312 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
7313 ; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z}
7314 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1472(%rdx)
7315 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1408(%rdx)
7316 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx)
7317 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1280(%rdx)
7318 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx)
7319 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1152(%rdx)
7320 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx)
7321 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
7322 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx)
7323 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx)
7324 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx)
7325 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx)
7326 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx)
7327 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx)
7328 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx)
7329 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx)
7330 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx)
7331 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx)
7332 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx)
7333 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx)
7334 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx)
7335 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx)
7336 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
7337 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
7338 ; AVX512F-ONLY-NEXT: vzeroupper
7339 ; AVX512F-ONLY-NEXT: retq
7340 ;
7341 ; AVX512DQ-LABEL: mask_replication_factor6_vf64:
7342 ; AVX512DQ: # %bb.0:
7343 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
7344 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
7345 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
7346 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1
7347 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
7348 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
7349 ; AVX512DQ-NEXT: movw $1, %ax
7350 ; AVX512DQ-NEXT: kmovw %eax, %k1
7351 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7352 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
7353 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7
7354 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
7355 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
7356 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
7357 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9
7358 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
7359 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
7360 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1
7361 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
7362 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2
7363 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
7364 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3
7365 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
7366 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5
7367 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
7368 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6
7369 ; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7
7370 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15
7371 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm16
7372 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm17
7373 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm18
7374 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm19
7375 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm4, %zmm8
7376 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm10, %zmm20
7377 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm21
7378 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm12, %zmm22
7379 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm23
7380 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm4, %zmm24
7381 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm14, %zmm9
7382 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm10, %zmm10
7383 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11
7384 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm12, %zmm12
7385 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13
7386 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm14, %zmm4
7387 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
7388 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
7389 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
7390 ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
7391 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z}
7392 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
7393 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
7394 ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
7395 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z}
7396 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
7397 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z}
7398 ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1
7399 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z}
7400 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
7401 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z}
7402 ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1
7403 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z}
7404 ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1
7405 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z}
7406 ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1
7407 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z}
7408 ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1
7409 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z}
7410 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
7411 ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
7412 ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1
7413 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z}
7414 ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1
7415 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z}
7416 ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1
7417 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z}
7418 ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1
7419 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
7420 ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
7421 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z}
7422 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
7423 ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z}
7424 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
7425 ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z}
7426 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
7427 ; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z}
7428 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
7429 ; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z}
7430 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
7431 ; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z}
7432 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
7433 ; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z}
7434 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rdx)
7435 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1408(%rdx)
7436 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx)
7437 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1280(%rdx)
7438 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx)
7439 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1152(%rdx)
7440 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx)
7441 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
7442 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx)
7443 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx)
7444 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx)
7445 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx)
7446 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx)
7447 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx)
7448 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx)
7449 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx)
7450 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx)
7451 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx)
7452 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx)
7453 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx)
7454 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx)
7455 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx)
7456 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
7457 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
7458 ; AVX512DQ-NEXT: vzeroupper
7459 ; AVX512DQ-NEXT: retq
7460 ;
7461 ; AVX512BW-LABEL: mask_replication_factor6_vf64:
7462 ; AVX512BW: # %bb.0:
7463 ; AVX512BW-NEXT: kmovq (%rdi), %k5
7464 ; AVX512BW-NEXT: movw $-3, %ax
7465 ; AVX512BW-NEXT: kmovd %eax, %k1
7466 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7467 ; AVX512BW-NEXT: kmovw (%rdi), %k0
7468 ; AVX512BW-NEXT: kandw %k1, %k0, %k3
7469 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
7470 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k0
7471 ; AVX512BW-NEXT: korw %k0, %k3, %k0
7472 ; AVX512BW-NEXT: movw $-5, %ax
7473 ; AVX512BW-NEXT: kmovd %eax, %k2
7474 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7475 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7476 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
7477 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7478 ; AVX512BW-NEXT: movw $-9, %ax
7479 ; AVX512BW-NEXT: kmovd %eax, %k2
7480 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7481 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7482 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
7483 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7484 ; AVX512BW-NEXT: movw $-17, %ax
7485 ; AVX512BW-NEXT: kmovd %eax, %k2
7486 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7487 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7488 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
7489 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7490 ; AVX512BW-NEXT: movw $-33, %ax
7491 ; AVX512BW-NEXT: kmovd %eax, %k2
7492 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7493 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7494 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
7495 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7496 ; AVX512BW-NEXT: movw $-65, %ax
7497 ; AVX512BW-NEXT: kmovd %eax, %k1
7498 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7499 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7500 ; AVX512BW-NEXT: kshiftrq $1, %k5, %k1
7501 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7502 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
7503 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7504 ; AVX512BW-NEXT: movw $-129, %ax
7505 ; AVX512BW-NEXT: kmovd %eax, %k2
7506 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7507 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7508 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
7509 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7510 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
7511 ; AVX512BW-NEXT: kmovd %eax, %k2
7512 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7513 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7514 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
7515 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7516 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
7517 ; AVX512BW-NEXT: kmovd %eax, %k2
7518 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7519 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7520 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
7521 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7522 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
7523 ; AVX512BW-NEXT: kmovd %eax, %k2
7524 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7525 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7526 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k3
7527 ; AVX512BW-NEXT: korw %k3, %k0, %k0
7528 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
7529 ; AVX512BW-NEXT: kmovd %eax, %k2
7530 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7531 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7532 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
7533 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7534 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
7535 ; AVX512BW-NEXT: kmovd %eax, %k1
7536 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7537 ; AVX512BW-NEXT: kandw %k1, %k0, %k3
7538 ; AVX512BW-NEXT: kshiftrq $2, %k5, %k1
7539 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
7540 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
7541 ; AVX512BW-NEXT: korw %k4, %k3, %k3
7542 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
7543 ; AVX512BW-NEXT: kmovd %eax, %k2
7544 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7545 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
7546 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
7547 ; AVX512BW-NEXT: korw %k7, %k3, %k7
7548 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
7549 ; AVX512BW-NEXT: kmovd %eax, %k2
7550 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
7551 ; AVX512BW-NEXT: kandw %k2, %k7, %k7
7552 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
7553 ; AVX512BW-NEXT: korw %k6, %k7, %k6
7554 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
7555 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
7556 ; AVX512BW-NEXT: korw %k0, %k6, %k6
7557 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z}
7558 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7559 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
7560 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
7561 ; AVX512BW-NEXT: korw %k0, %k1, %k0
7562 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7563 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7564 ; AVX512BW-NEXT: kmovq %k5, %k3
7565 ; AVX512BW-NEXT: kshiftrq $3, %k5, %k1
7566 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7567 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
7568 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7569 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7570 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7571 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
7572 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7573 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7574 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7575 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
7576 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7577 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7578 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7579 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
7580 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7581 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7582 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7583 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
7584 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7585 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7586 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7587 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
7588 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7589 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7590 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7591 ; AVX512BW-NEXT: kshiftrq $4, %k3, %k1
7592 ; AVX512BW-NEXT: kmovq %k3, %k7
7593 ; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
7594 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7595 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
7596 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7597 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7598 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7599 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
7600 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7601 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7602 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7603 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
7604 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7605 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7606 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7607 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
7608 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7609 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7610 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7611 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
7612 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7613 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7614 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7615 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
7616 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7617 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7618 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7619 ; AVX512BW-NEXT: kshiftrq $5, %k7, %k1
7620 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
7621 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7622 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
7623 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
7624 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
7625 ; AVX512BW-NEXT: korw %k6, %k0, %k7
7626 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z}
7627 ; AVX512BW-NEXT: kandw %k4, %k1, %k0
7628 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
7629 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7630 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7631 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
7632 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
7633 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7634 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7635 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
7636 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7637 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7638 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7639 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
7640 ; AVX512BW-NEXT: kshiftrq $6, %k7, %k1
7641 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7642 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
7643 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7644 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7645 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7646 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
7647 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7648 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7649 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7650 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
7651 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7652 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7653 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7654 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
7655 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7656 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7657 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7658 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
7659 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7660 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7661 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
7662 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7663 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7664 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
7665 ; AVX512BW-NEXT: kshiftrq $7, %k7, %k6
7666 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
7667 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
7668 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7669 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7670 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
7671 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
7672 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7673 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7674 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
7675 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
7676 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7677 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7678 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
7679 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
7680 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7681 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
7682 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
7683 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7684 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
7685 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
7686 ; AVX512BW-NEXT: korw %k0, %k1, %k1
7687 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
7688 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
7689 ; AVX512BW-NEXT: kshiftrq $8, %k7, %k0
7690 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7691 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
7692 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
7693 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
7694 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7695 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
7696 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
7697 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7698 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7699 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
7700 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
7701 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7702 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7703 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
7704 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
7705 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7706 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
7707 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
7708 ; AVX512BW-NEXT: korw %k0, %k1, %k0
7709 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7710 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7711 ; AVX512BW-NEXT: kshiftrq $9, %k7, %k1
7712 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7713 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
7714 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7715 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7716 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7717 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
7718 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7719 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7720 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7721 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
7722 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7723 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7724 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7725 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
7726 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7727 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7728 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
7729 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
7730 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7731 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7732 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7733 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
7734 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7735 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7736 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7737 ; AVX512BW-NEXT: kshiftrq $10, %k7, %k1
7738 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
7739 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
7740 ; AVX512BW-NEXT: korw %k7, %k0, %k0
7741 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7742 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7743 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
7744 ; AVX512BW-NEXT: korw %k7, %k0, %k0
7745 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
7746 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
7747 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
7748 ; AVX512BW-NEXT: korw %k7, %k0, %k0
7749 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
7750 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
7751 ; AVX512BW-NEXT: korw %k6, %k0, %k7
7752 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
7753 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
7754 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
7755 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
7756 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7757 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7758 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7759 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
7760 ; AVX512BW-NEXT: kshiftrq $11, %k7, %k1
7761 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7762 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
7763 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7764 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7765 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7766 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
7767 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7768 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7769 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
7770 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7771 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7772 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7773 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
7774 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7775 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7776 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
7777 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7778 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7779 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7780 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
7781 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7782 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7783 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7784 ; AVX512BW-NEXT: kshiftrq $12, %k7, %k1
7785 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7786 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
7787 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7788 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7789 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7790 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
7791 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7792 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
7793 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
7794 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7795 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7796 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
7797 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
7798 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7799 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7800 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
7801 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
7802 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7803 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7804 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
7805 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7806 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7807 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7808 ; AVX512BW-NEXT: kshiftrq $13, %k7, %k1
7809 ; AVX512BW-NEXT: kmovq %k7, %k2
7810 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
7811 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7812 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
7813 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
7814 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
7815 ; AVX512BW-NEXT: korw %k6, %k0, %k7
7816 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k7} {z}
7817 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
7818 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
7819 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
7820 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7821 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7822 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7823 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
7824 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7825 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7826 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7827 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
7828 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7829 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7830 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7831 ; AVX512BW-NEXT: kmovq %k2, %k7
7832 ; AVX512BW-NEXT: kshiftrq $14, %k2, %k1
7833 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7834 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
7835 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7836 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7837 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7838 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
7839 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7840 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7841 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7842 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
7843 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7844 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7845 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
7846 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7847 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7848 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
7849 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7850 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7851 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7852 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
7853 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7854 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7855 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
7856 ; AVX512BW-NEXT: kshiftrq $15, %k7, %k6
7857 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
7858 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
7859 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7860 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7861 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
7862 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
7863 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7864 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
7865 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
7866 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7867 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7868 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
7869 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
7870 ; AVX512BW-NEXT: korw %k7, %k1, %k1
7871 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7872 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
7873 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
7874 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7875 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
7876 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
7877 ; AVX512BW-NEXT: korw %k0, %k1, %k1
7878 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
7879 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
7880 ; AVX512BW-NEXT: kshiftrq $16, %k7, %k0
7881 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7882 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
7883 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
7884 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
7885 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7886 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7887 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
7888 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
7889 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7890 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7891 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
7892 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
7893 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7894 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7895 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
7896 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
7897 ; AVX512BW-NEXT: korw %k6, %k1, %k1
7898 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7899 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
7900 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
7901 ; AVX512BW-NEXT: korw %k0, %k1, %k0
7902 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7903 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7904 ; AVX512BW-NEXT: kshiftrq $17, %k7, %k1
7905 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7906 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
7907 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7908 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7909 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7910 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
7911 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7912 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7913 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7914 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
7915 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7916 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7917 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7918 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
7919 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7920 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7921 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
7922 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7923 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7924 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
7925 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7926 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
7927 ; AVX512BW-NEXT: kmovq %k7, %k4
7928 ; AVX512BW-NEXT: kshiftrq $18, %k7, %k1
7929 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
7930 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
7931 ; AVX512BW-NEXT: korw %k7, %k0, %k0
7932 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
7933 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7934 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
7935 ; AVX512BW-NEXT: korw %k7, %k0, %k0
7936 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
7937 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
7938 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
7939 ; AVX512BW-NEXT: korw %k7, %k0, %k0
7940 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
7941 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
7942 ; AVX512BW-NEXT: korw %k6, %k0, %k7
7943 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z}
7944 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
7945 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
7946 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
7947 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7948 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7949 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7950 ; AVX512BW-NEXT: kshiftrq $19, %k4, %k1
7951 ; AVX512BW-NEXT: kmovq %k4, %k7
7952 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7953 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
7954 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7955 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
7956 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
7957 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
7958 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7959 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
7960 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
7961 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
7962 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7963 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7964 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7965 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
7966 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7967 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7968 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7969 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
7970 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7971 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7972 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7973 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
7974 ; AVX512BW-NEXT: korw %k1, %k0, %k0
7975 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
7976 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
7977 ; AVX512BW-NEXT: kshiftrq $20, %k7, %k1
7978 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
7979 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
7980 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7981 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7982 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7983 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
7984 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7985 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7986 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7987 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
7988 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7989 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7990 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7991 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
7992 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7993 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
7994 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
7995 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
7996 ; AVX512BW-NEXT: korw %k6, %k0, %k0
7997 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
7998 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
7999 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8000 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8001 ; AVX512BW-NEXT: kshiftrq $21, %k7, %k1
8002 ; AVX512BW-NEXT: kmovq %k7, %k3
8003 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
8004 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8005 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8006 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8007 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8008 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8009 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
8010 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8011 ; AVX512BW-NEXT: kandw %k2, %k1, %k0
8012 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8013 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8014 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8015 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8016 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
8017 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8018 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8019 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
8020 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8021 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8022 ; AVX512BW-NEXT: kmovq %k3, %k5
8023 ; AVX512BW-NEXT: kshiftrq $22, %k3, %k1
8024 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8025 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8026 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8027 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8028 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8029 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8030 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8031 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8032 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8033 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8034 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8035 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8036 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8037 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8038 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8039 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8040 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8041 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8042 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8043 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8044 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8045 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
8046 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8047 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8048 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
8049 ; AVX512BW-NEXT: kshiftrq $23, %k5, %k6
8050 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
8051 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
8052 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8053 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8054 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8055 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
8056 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8057 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8058 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8059 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
8060 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8061 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8062 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8063 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
8064 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8065 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8066 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8067 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
8068 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8069 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
8070 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
8071 ; AVX512BW-NEXT: korw %k0, %k1, %k1
8072 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
8073 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
8074 ; AVX512BW-NEXT: kshiftrq $24, %k5, %k0
8075 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
8076 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
8077 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
8078 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8079 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8080 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8081 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
8082 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8083 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8084 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8085 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
8086 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8087 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8088 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8089 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
8090 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8091 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
8092 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
8093 ; AVX512BW-NEXT: korw %k0, %k1, %k0
8094 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8095 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8096 ; AVX512BW-NEXT: kshiftrq $25, %k5, %k1
8097 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8098 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8099 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8100 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8101 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8102 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8103 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8104 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8105 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8106 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8107 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8108 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8109 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8110 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8111 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8112 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8113 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8114 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8115 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8116 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8117 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
8118 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8119 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8120 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8121 ; AVX512BW-NEXT: kshiftrq $26, %k5, %k1
8122 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8123 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
8124 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8125 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8126 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8127 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
8128 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8129 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8130 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8131 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
8132 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8133 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8134 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8135 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8136 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k7} {z}
8137 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
8138 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
8139 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8140 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8141 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8142 ; AVX512BW-NEXT: kmovq %k5, %k7
8143 ; AVX512BW-NEXT: kshiftrq $27, %k5, %k1
8144 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8145 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
8146 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8147 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8148 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8149 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
8150 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8151 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8152 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8153 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8154 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8155 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8156 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8157 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8158 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8159 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8160 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8161 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8162 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8163 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8164 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8165 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
8166 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8167 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8168 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8169 ; AVX512BW-NEXT: kshiftrq $28, %k7, %k1
8170 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8171 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8172 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8173 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8174 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8175 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8176 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8177 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8178 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8179 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8180 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8181 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8182 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8183 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
8184 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8185 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8186 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8187 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
8188 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8189 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8190 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8191 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
8192 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8193 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8194 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8195 ; AVX512BW-NEXT: kshiftrq $29, %k7, %k1
8196 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
8197 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8198 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8199 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8200 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8201 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8202 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
8203 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
8204 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
8205 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8206 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8207 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8208 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8209 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
8210 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8211 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8212 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8213 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
8214 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8215 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8216 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
8217 ; AVX512BW-NEXT: kshiftrq $30, %k7, %k1
8218 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8219 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8220 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8221 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8222 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8223 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8224 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8225 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8226 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8227 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8228 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8229 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8230 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8231 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8232 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8233 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8234 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8235 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8236 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8237 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
8238 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8239 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
8240 ; AVX512BW-NEXT: kshiftrq $31, %k7, %k6
8241 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
8242 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
8243 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8244 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8245 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
8246 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
8247 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8248 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8249 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
8250 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
8251 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8252 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8253 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8254 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
8255 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8256 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8257 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8258 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
8259 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8260 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
8261 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
8262 ; AVX512BW-NEXT: korw %k0, %k1, %k1
8263 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
8264 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
8265 ; AVX512BW-NEXT: kshiftrq $32, %k7, %k0
8266 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8267 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
8268 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
8269 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
8270 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8271 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8272 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8273 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
8274 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8275 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8276 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8277 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
8278 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8279 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8280 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8281 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
8282 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8283 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8284 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8285 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
8286 ; AVX512BW-NEXT: korw %k0, %k1, %k0
8287 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8288 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8289 ; AVX512BW-NEXT: kshiftrq $33, %k7, %k1
8290 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8291 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8292 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8293 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8294 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8295 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8296 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8297 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8298 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8299 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8300 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8301 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8302 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8303 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8304 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8305 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8306 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8307 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8308 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8309 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
8310 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8311 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8312 ; AVX512BW-NEXT: kmovq %k7, %k5
8313 ; AVX512BW-NEXT: kshiftrq $34, %k7, %k1
8314 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8315 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
8316 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8317 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8318 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8319 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
8320 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8321 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8322 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8323 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
8324 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8325 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8326 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8327 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8328 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
8329 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
8330 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
8331 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8332 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8333 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8334 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8335 ; AVX512BW-NEXT: kshiftrq $35, %k5, %k1
8336 ; AVX512BW-NEXT: kmovq %k5, %k7
8337 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8338 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
8339 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8340 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8341 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8342 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
8343 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8344 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8345 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8346 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8347 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8348 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8349 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8350 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8351 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8352 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8353 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8354 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8355 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8356 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8357 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8358 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
8359 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8360 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8361 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8362 ; AVX512BW-NEXT: kshiftrq $36, %k7, %k1
8363 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8364 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8365 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8366 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8367 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8368 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8369 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8370 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8371 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8372 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8373 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8374 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8375 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
8376 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8377 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8378 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8379 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
8380 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8381 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8382 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
8383 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8384 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8385 ; AVX512BW-NEXT: kshiftrq $37, %k7, %k1
8386 ; AVX512BW-NEXT: kmovq %k7, %k3
8387 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
8388 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8389 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8390 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8391 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8392 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8393 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z}
8394 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
8395 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
8396 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8397 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8398 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8399 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8400 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
8401 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8402 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8403 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8404 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
8405 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8406 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8407 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8408 ; AVX512BW-NEXT: kmovq %k3, %k7
8409 ; AVX512BW-NEXT: kshiftrq $38, %k3, %k1
8410 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8411 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8412 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8413 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8414 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8415 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8416 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8417 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8418 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8419 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8420 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8421 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8422 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8423 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8424 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8425 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8426 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8427 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8428 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8429 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8430 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
8431 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8432 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
8433 ; AVX512BW-NEXT: kshiftrq $39, %k7, %k6
8434 ; AVX512BW-NEXT: kmovq %k7, %k5
8435 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
8436 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
8437 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8438 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8439 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8440 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
8441 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8442 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8443 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8444 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
8445 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8446 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8447 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8448 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
8449 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8450 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8451 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8452 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
8453 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8454 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
8455 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
8456 ; AVX512BW-NEXT: korw %k0, %k1, %k1
8457 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
8458 ; AVX512BW-NEXT: kshiftrq $40, %k5, %k0
8459 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8460 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
8461 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
8462 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
8463 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8464 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8465 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8466 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
8467 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8468 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
8469 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
8470 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8471 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8472 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8473 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
8474 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8475 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
8476 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
8477 ; AVX512BW-NEXT: korw %k0, %k1, %k0
8478 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8479 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8480 ; AVX512BW-NEXT: kshiftrq $41, %k5, %k1
8481 ; AVX512BW-NEXT: kmovq %k5, %k4
8482 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8483 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8484 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8485 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8486 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8487 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8488 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8489 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8490 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8491 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8492 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8493 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8494 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8495 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8496 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8497 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8498 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8499 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8500 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8501 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8502 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8503 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
8504 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8505 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8506 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8507 ; AVX512BW-NEXT: kshiftrq $42, %k4, %k1
8508 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8509 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
8510 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8511 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8512 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8513 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
8514 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8515 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8516 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8517 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
8518 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8519 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8520 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8521 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8522 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
8523 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8524 ; AVX512BW-NEXT: kandw %k5, %k1, %k0
8525 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8526 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8527 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8528 ; AVX512BW-NEXT: kmovq %k4, %k7
8529 ; AVX512BW-NEXT: kshiftrq $43, %k4, %k1
8530 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8531 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
8532 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8533 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8534 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8535 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
8536 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8537 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8538 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8539 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8540 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8541 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8542 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8543 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8544 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8545 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8546 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8547 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8548 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8549 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8550 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8551 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
8552 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8553 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8554 ; AVX512BW-NEXT: kshiftrq $44, %k7, %k1
8555 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8556 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8557 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8558 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8559 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8560 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8561 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8562 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8563 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8564 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8565 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8566 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8567 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8568 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
8569 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8570 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8571 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8572 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
8573 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8574 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8575 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8576 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
8577 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8578 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8579 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8580 ; AVX512BW-NEXT: kshiftrq $45, %k7, %k1
8581 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
8582 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8583 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8584 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8585 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8586 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8587 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
8588 ; AVX512BW-NEXT: kandw %k5, %k1, %k0
8589 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8590 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8591 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8592 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8593 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
8594 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8595 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8596 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8597 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
8598 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8599 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8600 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
8601 ; AVX512BW-NEXT: kshiftrq $46, %k5, %k1
8602 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8603 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8604 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8605 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8606 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8607 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8608 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8609 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8610 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8611 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8612 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8613 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8614 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8615 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8616 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8617 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8618 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8619 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8620 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8621 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
8622 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8623 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8624 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
8625 ; AVX512BW-NEXT: kshiftrq $47, %k5, %k6
8626 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
8627 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
8628 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8629 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8630 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
8631 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
8632 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8633 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8634 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
8635 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
8636 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8637 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8638 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8639 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
8640 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8641 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8642 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8643 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
8644 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8645 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
8646 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
8647 ; AVX512BW-NEXT: korw %k0, %k1, %k1
8648 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z}
8649 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
8650 ; AVX512BW-NEXT: kshiftrq $48, %k7, %k0
8651 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8652 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
8653 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
8654 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
8655 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8656 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8657 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8658 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
8659 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8660 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8661 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8662 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
8663 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8664 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8665 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8666 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
8667 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8668 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8669 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8670 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
8671 ; AVX512BW-NEXT: korw %k0, %k1, %k0
8672 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8673 ; AVX512BW-NEXT: kshiftrq $49, %k7, %k1
8674 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8675 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8676 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8677 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8678 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8679 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8680 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8681 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8682 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8683 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8684 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8685 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8686 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8687 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8688 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8689 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8690 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8691 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8692 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8693 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8694 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
8695 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8696 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8697 ; AVX512BW-NEXT: kmovq %k7, %k5
8698 ; AVX512BW-NEXT: kshiftrq $50, %k7, %k1
8699 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8700 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
8701 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8702 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8703 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8704 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
8705 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8706 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8707 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8708 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
8709 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8710 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8711 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8712 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8713 ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k7} {z}
8714 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
8715 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
8716 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8717 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8718 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8719 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8720 ; AVX512BW-NEXT: kshiftrq $51, %k5, %k1
8721 ; AVX512BW-NEXT: kmovq %k5, %k7
8722 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8723 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
8724 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8725 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8726 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8727 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
8728 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8729 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8730 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8731 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8732 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8733 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8734 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8735 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8736 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8737 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8738 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8739 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8740 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8741 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8742 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
8743 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8744 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8745 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8746 ; AVX512BW-NEXT: kshiftrq $52, %k7, %k1
8747 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8748 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8749 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8750 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8751 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8752 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8753 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8754 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8755 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8756 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8757 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8758 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8759 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8760 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
8761 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8762 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8763 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8764 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
8765 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8766 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8767 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
8768 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8769 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8770 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8771 ; AVX512BW-NEXT: kshiftrq $53, %k7, %k1
8772 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
8773 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8774 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8775 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8776 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8777 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8778 ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k7} {z}
8779 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
8780 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
8781 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8782 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8783 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8784 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8785 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
8786 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8787 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8788 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
8789 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8790 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8791 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8792 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
8793 ; AVX512BW-NEXT: kshiftrq $54, %k7, %k1
8794 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8795 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8796 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8797 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8798 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8799 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8800 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8801 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8802 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8803 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8804 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8805 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8806 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
8807 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8808 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8809 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8810 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8811 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8812 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8813 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8814 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
8815 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8816 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8817 ; AVX512BW-NEXT: kandw %k3, %k0, %k1
8818 ; AVX512BW-NEXT: kshiftrq $55, %k7, %k6
8819 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
8820 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
8821 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8822 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8823 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8824 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
8825 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8826 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8827 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8828 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
8829 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8830 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
8831 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
8832 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
8833 ; AVX512BW-NEXT: korw %k7, %k1, %k1
8834 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
8835 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
8836 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8837 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
8838 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
8839 ; AVX512BW-NEXT: korw %k0, %k1, %k1
8840 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
8841 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
8842 ; AVX512BW-NEXT: kshiftrq $56, %k5, %k0
8843 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8844 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
8845 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
8846 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
8847 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8848 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8849 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8850 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
8851 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8852 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8853 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
8854 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
8855 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8856 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
8857 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
8858 ; AVX512BW-NEXT: korw %k6, %k1, %k1
8859 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8860 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
8861 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
8862 ; AVX512BW-NEXT: korw %k0, %k1, %k0
8863 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8864 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8865 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1
8866 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8867 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8868 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8869 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8870 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8871 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8872 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8873 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8874 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8875 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8876 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8877 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8878 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8879 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8880 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8881 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8882 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8883 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8884 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8885 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8886 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
8887 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8888 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8889 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8890 ; AVX512BW-NEXT: kshiftrq $58, %k5, %k1
8891 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8892 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
8893 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8894 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
8895 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
8896 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
8897 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8898 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
8899 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
8900 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
8901 ; AVX512BW-NEXT: korw %k7, %k0, %k0
8902 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8903 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8904 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8905 ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
8906 ; AVX512BW-NEXT: kandw %k2, %k1, %k0
8907 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8908 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8909 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8910 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8911 ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1
8912 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8913 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
8914 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8915 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8916 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8917 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
8918 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8919 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8920 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8921 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8922 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8923 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8924 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8925 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8926 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8927 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8928 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8929 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8930 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8931 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
8932 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
8933 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
8934 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8935 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8936 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8937 ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1
8938 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8939 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
8940 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8941 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8942 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
8943 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8944 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8945 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8946 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
8947 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8948 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
8949 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
8950 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
8951 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8952 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8953 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
8954 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8955 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
8956 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
8957 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
8958 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8959 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8960 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8961 ; AVX512BW-NEXT: kshiftrq $61, %k5, %k1
8962 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
8963 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8964 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
8965 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
8966 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
8967 ; AVX512BW-NEXT: korw %k6, %k0, %k7
8968 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k7} {z}
8969 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
8970 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
8971 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
8972 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8973 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8974 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8975 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
8976 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8977 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8978 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
8979 ; AVX512BW-NEXT: korw %k1, %k0, %k0
8980 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
8981 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
8982 ; AVX512BW-NEXT: kshiftrq $62, %k5, %k1
8983 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
8984 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
8985 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8986 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8987 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8988 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
8989 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8990 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8991 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8992 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
8993 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8994 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8995 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
8996 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
8997 ; AVX512BW-NEXT: korw %k6, %k0, %k0
8998 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
8999 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9000 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
9001 ; AVX512BW-NEXT: korw %k6, %k0, %k0
9002 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9003 ; AVX512BW-NEXT: kandw %k2, %k0, %k6
9004 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k0
9005 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
9006 ; AVX512BW-NEXT: korw %k1, %k6, %k1
9007 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9008 ; AVX512BW-NEXT: kandw %k2, %k1, %k2
9009 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
9010 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
9011 ; AVX512BW-NEXT: korw %k6, %k2, %k2
9012 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
9013 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
9014 ; AVX512BW-NEXT: korw %k6, %k2, %k2
9015 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
9016 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
9017 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k5
9018 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9019 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
9020 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k4
9021 ; AVX512BW-NEXT: korw %k4, %k2, %k2
9022 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
9023 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
9024 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
9025 ; AVX512BW-NEXT: korw %k0, %k2, %k0
9026 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
9027 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
9028 ; AVX512BW-NEXT: korw %k1, %k0, %k1
9029 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
9030 ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx)
9031 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx)
9032 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx)
9033 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx)
9034 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
9035 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx)
9036 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx)
9037 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
9038 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx)
9039 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx)
9040 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx)
9041 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx)
9042 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
9043 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx)
9044 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
9045 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx)
9046 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
9047 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
9048 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
9049 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
9050 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
9051 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
9052 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
9053 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
9054 ; AVX512BW-NEXT: vzeroupper
9055 ; AVX512BW-NEXT: retq
9056 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
9057 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
9058 %data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison)
9059   store <384 x i32> %data, ptr %out.vec, align 64
9060   ret void
9061 }
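; Replication factor 7, VF=2: the first 2 bits of the 64-bit source mask are
; each repeated 7 times to form the <14 x i1> predicate for the masked load,
; e.g. mask bits [a,b] expand to [a,a,a,a,a,a,a,b,b,b,b,b,b,b].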
9063 define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
9064 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2:
9065 ; AVX512F-ONLY: # %bb.0:
9066 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
9067 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9068 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
9069 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
9070 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
9071 ; AVX512F-ONLY-NEXT: movw $16383, %ax # imm = 0x3FFF
9072 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
9073 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
9074 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
9075 ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
9076 ; AVX512F-ONLY-NEXT: vextracti32x4 $3, %zmm0, %xmm1
9077 ; AVX512F-ONLY-NEXT: vmovq %xmm1, 48(%rdx)
9078 ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
9079 ; AVX512F-ONLY-NEXT: vzeroupper
9080 ; AVX512F-ONLY-NEXT: retq
9082 ; AVX512DQ-LABEL: mask_replication_factor7_vf2:
9083 ; AVX512DQ: # %bb.0:
9084 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
9085 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
9086 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
9087 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
9088 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
9089 ; AVX512DQ-NEXT: movw $16383, %ax # imm = 0x3FFF
9090 ; AVX512DQ-NEXT: kmovw %eax, %k1
9091 ; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
9092 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
9093 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
9094 ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
9095 ; AVX512DQ-NEXT: vmovq %xmm1, 48(%rdx)
9096 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
9097 ; AVX512DQ-NEXT: vzeroupper
9098 ; AVX512DQ-NEXT: retq
9100 ; AVX512BW-LABEL: mask_replication_factor7_vf2:
9101 ; AVX512BW: # %bb.0:
9102 ; AVX512BW-NEXT: kmovw (%rdi), %k1
9103 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9104 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
9105 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
9106 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
9107 ; AVX512BW-NEXT: movw $16383, %ax # imm = 0x3FFF
9108 ; AVX512BW-NEXT: kmovd %eax, %k1
9109 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
9110 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
9111 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
9112 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
9113 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
9114 ; AVX512BW-NEXT: vmovq %xmm0, 48(%rdx)
9115 ; AVX512BW-NEXT: vzeroupper
9116 ; AVX512BW-NEXT: retq
9117 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
9118 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
9119 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
9120 %data = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr %in.vec, i32 64, <14 x i1> %tgt.mask, <14 x i32> poison)
9121 %data.padded = shufflevector <14 x i32> %data, <14 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef>
9122   store <14 x i32> %data, ptr %out.vec, align 64
9123   ret void
9124 }
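; Replication factor 7, VF=4: 4 mask bits expand to a <28 x i1> predicate
; (indices 0..3, each repeated 7 times). The BW lowering replicates at word
; granularity with vpermw and splits the resulting 32-bit k-mask with kshiftrd.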
9126 define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
9127 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4:
9128 ; AVX512F-ONLY: # %bb.0:
9129 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
9130 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9131 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
9132 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
9133 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1
9134 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
9135 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
9136 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
9137 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9138 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
9139 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
9140 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
9141 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
9142 ; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 96(%rdx)
9143 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
9144 ; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 64(%rdx)
9145 ; AVX512F-ONLY-NEXT: vzeroupper
9146 ; AVX512F-ONLY-NEXT: retq
9148 ; AVX512DQ-LABEL: mask_replication_factor7_vf4:
9149 ; AVX512DQ: # %bb.0:
9150 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
9151 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
9152 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
9153 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
9154 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
9155 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
9156 ; AVX512DQ-NEXT: kmovw %eax, %k1
9157 ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
9158 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9159 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
9160 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
9161 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
9162 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
9163 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 96(%rdx)
9164 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
9165 ; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rdx)
9166 ; AVX512DQ-NEXT: vzeroupper
9167 ; AVX512DQ-NEXT: retq
9169 ; AVX512BW-LABEL: mask_replication_factor7_vf4:
9170 ; AVX512BW: # %bb.0:
9171 ; AVX512BW-NEXT: kmovd (%rdi), %k0
9172 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
9173 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
9174 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
9175 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
9176 ; AVX512BW-NEXT: movl $268435455, %eax # imm = 0xFFFFFFF
9177 ; AVX512BW-NEXT: kmovd %eax, %k1
9178 ; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
9179 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
9180 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
9181 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
9182 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 96(%rdx)
9183 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
9184 ; AVX512BW-NEXT: vmovdqa %ymm1, 64(%rdx)
9185 ; AVX512BW-NEXT: vzeroupper
9186 ; AVX512BW-NEXT: retq
9187 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
9188 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
9189 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
9190 %data = call <28 x i32> @llvm.masked.load.v28i32.p0(ptr %in.vec, i32 64, <28 x i1> %tgt.mask, <28 x i32> poison)
9191 %data.padded = shufflevector <28 x i32> %data, <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
9192   store <28 x i32> %data, ptr %out.vec, align 64
9193   ret void
9194 }
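; Replication factor 7, VF=8: 8 mask bits expand to a <56 x i1> predicate,
; i.e. three full <16 x i32> chunks plus an 8-element tail. The F/DQ paths
; widen the mask to dword lanes and select each chunk with vpermd (the tail
; via a ymm shuffle); the BW/VBMI paths replicate at byte granularity
; (vpshufb / vpermb) and split the 64-bit k-register with kshiftrq/kshiftrd.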
9196 define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
9197 ; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8:
9198 ; AVX512F-SLOW: # %bb.0:
9199 ; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1
9200 ; AVX512F-SLOW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9201 ; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9202 ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
9203 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2
9204 ; AVX512F-SLOW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
9205 ; AVX512F-SLOW-NEXT: movw $1, %ax
9206 ; AVX512F-SLOW-NEXT: kmovw %eax, %k2
9207 ; AVX512F-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
9208 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2
9209 ; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9210 ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
9211 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k3
9212 ; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9213 ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
9214 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k4
9215 ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
9216 ; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
9217 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
9218 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
9219 ; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1
9220 ; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
9221 ; AVX512F-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
9222 ; AVX512F-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
9223 ; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
9224 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
9225 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
9226 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
9227 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx)
9228 ; AVX512F-SLOW-NEXT: vzeroupper
9229 ; AVX512F-SLOW-NEXT: retq
9231 ; AVX512F-FAST-LABEL: mask_replication_factor7_vf8:
9232 ; AVX512F-FAST: # %bb.0:
9233 ; AVX512F-FAST-NEXT: kmovw (%rdi), %k1
9234 ; AVX512F-FAST-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9235 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9236 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
9237 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2
9238 ; AVX512F-FAST-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
9239 ; AVX512F-FAST-NEXT: movw $1, %ax
9240 ; AVX512F-FAST-NEXT: kmovw %eax, %k2
9241 ; AVX512F-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
9242 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2
9243 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9244 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
9245 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k3
9246 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9247 ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
9248 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k4
9249 ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
9250 ; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
9251 ; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
9252 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
9253 ; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1
9254 ; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
9255 ; AVX512F-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
9256 ; AVX512F-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
9257 ; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
9258 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
9259 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
9260 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rdx)
9261 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
9262 ; AVX512F-FAST-NEXT: vzeroupper
9263 ; AVX512F-FAST-NEXT: retq
9265 ; AVX512DQ-SLOW-LABEL: mask_replication_factor7_vf8:
9266 ; AVX512DQ-SLOW: # %bb.0:
9267 ; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0
9268 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
9269 ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9270 ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
9271 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
9272 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k1, %zmm1
9273 ; AVX512DQ-SLOW-NEXT: movw $1, %ax
9274 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
9275 ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
9276 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
9277 ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9278 ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
9279 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k2
9280 ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9281 ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
9282 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k3
9283 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
9284 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
9285 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
9286 ; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k4
9287 ; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
9288 ; AVX512DQ-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
9289 ; AVX512DQ-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
9290 ; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
9291 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
9292 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
9293 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx)
9294 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
9295 ; AVX512DQ-SLOW-NEXT: vzeroupper
9296 ; AVX512DQ-SLOW-NEXT: retq
9298 ; AVX512DQ-FAST-LABEL: mask_replication_factor7_vf8:
9299 ; AVX512DQ-FAST: # %bb.0:
9300 ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0
9301 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
9302 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9303 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
9304 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
9305 ; AVX512DQ-FAST-NEXT: vpmovm2d %k1, %zmm1
9306 ; AVX512DQ-FAST-NEXT: movw $1, %ax
9307 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
9308 ; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
9309 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
9310 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9311 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
9312 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k2
9313 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9314 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
9315 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k3
9316 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0
9317 ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
9318 ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
9319 ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k4
9320 ; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
9321 ; AVX512DQ-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
9322 ; AVX512DQ-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
9323 ; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
9324 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx)
9325 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx)
9326 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 192(%rdx)
9327 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
9328 ; AVX512DQ-FAST-NEXT: vzeroupper
9329 ; AVX512DQ-FAST-NEXT: retq
9331 ; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf8:
9332 ; AVX512BW-ONLY: # %bb.0:
9333 ; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0
9334 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
9335 ; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0
9336 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,u,u,u,u,u,u,u,u]
9337 ; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
9338 ; AVX512BW-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
9339 ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
9340 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
9341 ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
9342 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
9343 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
9344 ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
9345 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
9346 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
9347 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
9348 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
9349 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
9350 ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
9351 ; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx)
9352 ; AVX512BW-ONLY-NEXT: vzeroupper
9353 ; AVX512BW-ONLY-NEXT: retq
9355 ; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf8:
9356 ; AVX512VBMI-ONLY: # %bb.0:
9357 ; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0
9358 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
9359 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u]
9360 ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
9361 ; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1
9362 ; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
9363 ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
9364 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
9365 ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
9366 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
9367 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
9368 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
9369 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
9370 ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
9371 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
9372 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
9373 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
9374 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
9375 ; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx)
9376 ; AVX512VBMI-ONLY-NEXT: vzeroupper
9377 ; AVX512VBMI-ONLY-NEXT: retq
9378 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
9379 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
9380 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
9381 %data = call <56 x i32> @llvm.masked.load.v56i32.p0(ptr %in.vec, i32 64, <56 x i1> %tgt.mask, <56 x i32> poison)
9382 %data.padded = shufflevector <56 x i32> %data, <56 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
9383   store <56 x i32> %data, ptr %out.vec, align 64
9384   ret void
9385 }
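; Replication factor 7, VF=16: 16 mask bits expand to a <112 x i1> predicate,
; i.e. seven full <16 x i32> chunks, each derived from a vpermd of the widened mask.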
9387 define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
9388 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16:
9389 ; AVX512F-ONLY: # %bb.0:
9390 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
9391 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9392 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9393 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
9394 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
9395 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
9396 ; AVX512F-ONLY-NEXT: movw $1, %ax
9397 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
9398 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
9399 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
9400 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9401 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
9402 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
9403 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9404 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
9405 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
9406 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
9407 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
9408 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
9409 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
9410 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
9411 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
9412 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
9413 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
9414 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
9415 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
9416 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
9417 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7
9418 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
9419 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
9420 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
9421 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
9422 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
9423 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
9424 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
9425 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx)
9426 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
9427 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx)
9428 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx)
9429 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx)
9430 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx)
9431 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
9432 ; AVX512F-ONLY-NEXT: vzeroupper
9433 ; AVX512F-ONLY-NEXT: retq
9435 ; AVX512DQ-LABEL: mask_replication_factor7_vf16:
9436 ; AVX512DQ: # %bb.0:
9437 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
9438 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
9439 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9440 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
9441 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
9442 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
9443 ; AVX512DQ-NEXT: movw $1, %ax
9444 ; AVX512DQ-NEXT: kmovw %eax, %k1
9445 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
9446 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
9447 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9448 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
9449 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
9450 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9451 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
9452 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
9453 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
9454 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
9455 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
9456 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
9457 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
9458 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
9459 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
9460 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
9461 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
9462 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
9463 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
9464 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7
9465 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
9466 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z}
9467 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z}
9468 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z}
9469 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z}
9470 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z}
9471 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
9472 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx)
9473 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx)
9474 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx)
9475 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx)
9476 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx)
9477 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx)
9478 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
9479 ; AVX512DQ-NEXT: vzeroupper
9480 ; AVX512DQ-NEXT: retq
9482 ; AVX512BW-LABEL: mask_replication_factor7_vf16:
9483 ; AVX512BW: # %bb.0:
9484 ; AVX512BW-NEXT: kmovw (%rdi), %k1
9485 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9486 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9487 ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
9488 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
9489 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
9490 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
9491 ; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
9492 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
9493 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z}
9494 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
9495 ; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3
9496 ; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
9497 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
9498 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
9499 ; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4
9500 ; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1
9501 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
9502 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
9503 ; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5
9504 ; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1
9505 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z}
9506 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9507 ; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6
9508 ; AVX512BW-NEXT: vptestmd %zmm6, %zmm6, %k1
9509 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
9510 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9511 ; AVX512BW-NEXT: vpermd %zmm0, %zmm7, %zmm0
9512 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
9513 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
9514 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
9515 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx)
9516 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx)
9517 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
9518 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rdx)
9519 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx)
9520 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
9521 ; AVX512BW-NEXT: vzeroupper
9522 ; AVX512BW-NEXT: retq
9523 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
9524 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
9525 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
9526 %data = call <112 x i32> @llvm.masked.load.v112i32.p0(ptr %in.vec, i32 64, <112 x i1> %tgt.mask, <112 x i32> poison)
9527   store <112 x i32> %data, ptr %out.vec, align 64
9528   ret void
9529 }
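; Replication factor 7, VF=32: 32 mask bits expand to a <224 x i1> predicate
; (fourteen <16 x i32> chunks). The F/DQ paths reuse the seven vpermd index
; patterns on both 16-bit halves of the mask, while the plain BW path assembles
; each 16-lane k-register bit by bit with kshift/kand/kor sequences.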
9531 define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
9532 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32:
9533 ; AVX512F-ONLY: # %bb.0:
9534 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
9535 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
9536 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9537 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
9538 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
9539 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
9540 ; AVX512F-ONLY-NEXT: movw $1, %ax
9541 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
9542 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
9543 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
9544 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
9545 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
9546 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9547 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4
9548 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9549 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6
9550 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
9551 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8
9552 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
9553 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10
9554 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
9555 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12
9556 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
9557 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0
9558 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1
9559 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2
9560 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5
9561 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7
9562 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9
9563 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm11
9564 ; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm3
9565 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z}
9566 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
9567 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
9568 ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
9569 ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
9570 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
9571 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
9572 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
9573 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
9574 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
9575 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
9576 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
9577 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
9578 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
9579 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
9580 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
9581 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
9582 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
9583 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
9584 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
9585 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
9586 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
9587 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
9588 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
9589 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
9590 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
9591 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
9592 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx)
9593 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx)
9594 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
9595 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx)
9596 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 320(%rdx)
9597 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx)
9598 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx)
9599 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx)
9600 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 576(%rdx)
9601 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 640(%rdx)
9602 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 704(%rdx)
9603 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 768(%rdx)
9604 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx)
9605 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx)
9606 ; AVX512F-ONLY-NEXT: vzeroupper
9607 ; AVX512F-ONLY-NEXT: retq
9609 ; AVX512DQ-LABEL: mask_replication_factor7_vf32:
9610 ; AVX512DQ: # %bb.0:
9611 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
9612 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
9613 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
9614 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
9615 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
9616 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
9617 ; AVX512DQ-NEXT: movw $1, %ax
9618 ; AVX512DQ-NEXT: kmovw %eax, %k1
9619 ; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
9620 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
9621 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
9622 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
9623 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
9624 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4
9625 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
9626 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6
9627 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
9628 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8
9629 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
9630 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10
9631 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
9632 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12
9633 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
9634 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0
9635 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1
9636 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2
9637 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5
9638 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7
9639 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9
9640 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm11
9641 ; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm3
9642 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z}
9643 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
9644 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
9645 ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
9646 ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z}
9647 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
9648 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z}
9649 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
9650 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z}
9651 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
9652 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
9653 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
9654 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
9655 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
9656 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z}
9657 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
9658 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z}
9659 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
9660 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z}
9661 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
9662 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
9663 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
9664 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
9665 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
9666 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z}
9667 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
9668 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z}
9669 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx)
9670 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx)
9671 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
9672 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx)
9673 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rdx)
9674 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx)
9675 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx)
9676 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx)
9677 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rdx)
9678 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 640(%rdx)
9679 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 704(%rdx)
9680 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rdx)
9681 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx)
9682 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx)
9683 ; AVX512DQ-NEXT: vzeroupper
9684 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf32:
9687 ; AVX512BW: # %bb.0:
9688 ; AVX512BW-NEXT: movw $-3, %ax
9689 ; AVX512BW-NEXT: kmovd %eax, %k2
9690 ; AVX512BW-NEXT: kmovw (%rdi), %k0
9691 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
9692 ; AVX512BW-NEXT: kmovq %k2, %k3
9693 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9694 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
9695 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k2
9696 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9697 ; AVX512BW-NEXT: movw $-5, %ax
9698 ; AVX512BW-NEXT: kmovd %eax, %k2
9699 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9700 ; AVX512BW-NEXT: kmovq %k2, %k4
9701 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2
9702 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9703 ; AVX512BW-NEXT: movw $-9, %ax
9704 ; AVX512BW-NEXT: kmovd %eax, %k2
9705 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9706 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9707 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k2
9708 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9709 ; AVX512BW-NEXT: movw $-17, %ax
9710 ; AVX512BW-NEXT: kmovd %eax, %k2
9711 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9712 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9713 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k2
9714 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9715 ; AVX512BW-NEXT: movw $-33, %ax
9716 ; AVX512BW-NEXT: kmovd %eax, %k2
9717 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9718 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9719 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k2
9720 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9721 ; AVX512BW-NEXT: movw $-65, %ax
9722 ; AVX512BW-NEXT: kmovd %eax, %k2
9723 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9724 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9725 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
9726 ; AVX512BW-NEXT: korw %k0, %k1, %k0
9727 ; AVX512BW-NEXT: movw $-129, %ax
9728 ; AVX512BW-NEXT: kmovd %eax, %k1
9729 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9730 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
9731 ; AVX512BW-NEXT: kmovd (%rdi), %k6
9732 ; AVX512BW-NEXT: kshiftrd $1, %k6, %k0
9733 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
9734 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2
9735 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9736 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
9737 ; AVX512BW-NEXT: kmovd %eax, %k2
9738 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9739 ; AVX512BW-NEXT: kmovq %k2, %k7
9740 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9741 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k2
9742 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9743 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
9744 ; AVX512BW-NEXT: kmovd %eax, %k5
9745 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
9746 ; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9747 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
9748 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9749 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
9750 ; AVX512BW-NEXT: kmovd %eax, %k2
9751 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9752 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9753 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
9754 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9755 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
9756 ; AVX512BW-NEXT: kmovd %eax, %k2
9757 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9758 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9759 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
9760 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9761 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
9762 ; AVX512BW-NEXT: kmovd %eax, %k2
9763 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9764 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9765 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
9766 ; AVX512BW-NEXT: korw %k2, %k1, %k1
9767 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
9768 ; AVX512BW-NEXT: kmovd %eax, %k2
9769 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9770 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
9771 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
9772 ; AVX512BW-NEXT: korw %k0, %k1, %k0
9773 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
9774 ; AVX512BW-NEXT: kmovd %eax, %k1
9775 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9776 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
9777 ; AVX512BW-NEXT: kshiftrd $2, %k6, %k2
9778 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
9779 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k1
9780 ; AVX512BW-NEXT: korw %k1, %k0, %k0
9781 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
9782 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
9783 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k1
9784 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9785 ; AVX512BW-NEXT: korw %k1, %k0, %k1
9786 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
9787 ; AVX512BW-NEXT: kmovq %k6, %k2
9788 ; AVX512BW-NEXT: kshiftrd $29, %k6, %k1
9789 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
9790 ; AVX512BW-NEXT: kandw %k3, %k1, %k0
9791 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
9792 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9793 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
9794 ; AVX512BW-NEXT: korw %k1, %k0, %k0
9795 ; AVX512BW-NEXT: kmovq %k4, %k6
9796 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9797 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
9798 ; AVX512BW-NEXT: kshiftrd $30, %k2, %k1
9799 ; AVX512BW-NEXT: kmovq %k2, %k4
9800 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
9801 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
9802 ; AVX512BW-NEXT: korw %k3, %k0, %k0
9803 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9804 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9805 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
9806 ; AVX512BW-NEXT: korw %k3, %k0, %k0
9807 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9808 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9809 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
9810 ; AVX512BW-NEXT: korw %k3, %k0, %k0
9811 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9812 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9813 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k3
9814 ; AVX512BW-NEXT: korw %k3, %k0, %k0
9815 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9816 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9817 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
9818 ; AVX512BW-NEXT: korw %k3, %k0, %k0
9819 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9820 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9821 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
9822 ; AVX512BW-NEXT: korw %k3, %k0, %k0
9823 ; AVX512BW-NEXT: kandw %k7, %k0, %k0
9824 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
9825 ; AVX512BW-NEXT: korw %k1, %k0, %k0
9826 ; AVX512BW-NEXT: kandw %k5, %k0, %k3
9827 ; AVX512BW-NEXT: kshiftrd $31, %k4, %k0
9828 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
9829 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
9830 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7
9831 ; AVX512BW-NEXT: korw %k7, %k3, %k3
9832 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
9833 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
9834 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
9835 ; AVX512BW-NEXT: korw %k7, %k3, %k3
9836 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9837 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
9838 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
9839 ; AVX512BW-NEXT: korw %k7, %k3, %k3
9840 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9841 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
9842 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
9843 ; AVX512BW-NEXT: korw %k7, %k3, %k3
9844 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
9845 ; AVX512BW-NEXT: kandw %k7, %k3, %k3
9846 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
9847 ; AVX512BW-NEXT: korw %k7, %k3, %k3
9848 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
9849 ; AVX512BW-NEXT: kandw %k7, %k3, %k3
9850 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
9851 ; AVX512BW-NEXT: korw %k0, %k3, %k0
9852 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
9853 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
9854 ; AVX512BW-NEXT: korw %k1, %k0, %k1
9855 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z}
9856 ; AVX512BW-NEXT: kshiftrd $27, %k4, %k1
9857 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
9858 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
9859 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
9860 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k3
9861 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
9862 ; AVX512BW-NEXT: kshiftrw $14, %k3, %k7
9863 ; AVX512BW-NEXT: korw %k7, %k0, %k0
9864 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
9865 ; AVX512BW-NEXT: kshiftrw $13, %k3, %k7
9866 ; AVX512BW-NEXT: korw %k7, %k0, %k0
9867 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9868 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
9869 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k7
9870 ; AVX512BW-NEXT: korw %k7, %k0, %k0
9871 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
9872 ; AVX512BW-NEXT: kandw %k3, %k0, %k7
9873 ; AVX512BW-NEXT: kshiftrd $28, %k4, %k0
9874 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
9875 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
9876 ; AVX512BW-NEXT: korw %k6, %k7, %k6
9877 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9878 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
9879 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k7
9880 ; AVX512BW-NEXT: korw %k7, %k6, %k6
9881 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9882 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
9883 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k7
9884 ; AVX512BW-NEXT: korw %k7, %k6, %k6
9885 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9886 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
9887 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k7
9888 ; AVX512BW-NEXT: korw %k7, %k6, %k6
9889 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9890 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
9891 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k7
9892 ; AVX512BW-NEXT: korw %k7, %k6, %k6
9893 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9894 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
9895 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
9896 ; AVX512BW-NEXT: korw %k7, %k6, %k6
9897 ; AVX512BW-NEXT: kandw %k5, %k6, %k6
9898 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
9899 ; AVX512BW-NEXT: korw %k0, %k6, %k0
9900 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
9901 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
9902 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
9903 ; AVX512BW-NEXT: kshiftrw $4, %k7, %k6
9904 ; AVX512BW-NEXT: korw %k6, %k0, %k0
9905 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9906 ; AVX512BW-NEXT: kmovq %k2, %k4
9907 ; AVX512BW-NEXT: kshiftrw $3, %k7, %k6
9908 ; AVX512BW-NEXT: korw %k6, %k0, %k0
9909 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9910 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9911 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
9912 ; AVX512BW-NEXT: korw %k6, %k0, %k0
9913 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9914 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
9915 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
9916 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k5
9917 ; AVX512BW-NEXT: korw %k5, %k0, %k0
9918 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
9919 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
9920 ; AVX512BW-NEXT: korw %k7, %k0, %k2
9921 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z}
9922 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
9923 ; AVX512BW-NEXT: kshiftrd $25, %k6, %k0
9924 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
9925 ; AVX512BW-NEXT: kandw %k2, %k0, %k2
9926 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
9927 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
9928 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9929 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
9930 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
9931 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
9932 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9933 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
9934 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
9935 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
9936 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9937 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
9938 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
9939 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9940 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
9941 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
9942 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
9943 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9944 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
9945 ; AVX512BW-NEXT: kandw %k3, %k2, %k5
9946 ; AVX512BW-NEXT: kshiftrd $26, %k6, %k2
9947 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
9948 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
9949 ; AVX512BW-NEXT: korw %k6, %k5, %k5
9950 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
9951 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
9952 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
9953 ; AVX512BW-NEXT: korw %k6, %k5, %k5
9954 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
9955 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
9956 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
9957 ; AVX512BW-NEXT: korw %k6, %k5, %k5
9958 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
9959 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
9960 ; AVX512BW-NEXT: korw %k6, %k5, %k5
9961 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9962 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
9963 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
9964 ; AVX512BW-NEXT: korw %k6, %k5, %k5
9965 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9966 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
9967 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
9968 ; AVX512BW-NEXT: korw %k6, %k5, %k5
9969 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
9970 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
9971 ; AVX512BW-NEXT: korw %k2, %k5, %k2
9972 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
9973 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
9974 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
9975 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k5
9976 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9977 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9978 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
9979 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
9980 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k3
9981 ; AVX512BW-NEXT: korw %k3, %k2, %k2
9982 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
9983 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
9984 ; AVX512BW-NEXT: korw %k6, %k2, %k1
9985 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
9986 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
9987 ; AVX512BW-NEXT: kshiftrd $23, %k6, %k1
9988 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k3
9989 ; AVX512BW-NEXT: kshiftrd $22, %k6, %k5
9990 ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
9991 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9992 ; AVX512BW-NEXT: kandw %k1, %k5, %k2
9993 ; AVX512BW-NEXT: kshiftrw $14, %k3, %k5
9994 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9995 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
9996 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
9997 ; AVX512BW-NEXT: kshiftrw $13, %k3, %k5
9998 ; AVX512BW-NEXT: korw %k5, %k2, %k2
9999 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10000 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10001 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k5
10002 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10003 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10004 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
10005 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
10006 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10007 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
10008 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
10009 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10010 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
10011 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
10012 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
10013 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10014 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10015 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
10016 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3
10017 ; AVX512BW-NEXT: korw %k3, %k2, %k2
10018 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10019 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
10020 ; AVX512BW-NEXT: kshiftrd $24, %k6, %k3
10021 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k5
10022 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
10023 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10024 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10025 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
10026 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
10027 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10028 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10029 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
10030 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
10031 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10032 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10033 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
10034 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
10035 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10036 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10037 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
10038 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
10039 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10040 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10041 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
10042 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10043 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10044 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10045 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
10046 ; AVX512BW-NEXT: korw %k3, %k2, %k2
10047 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
10048 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
10049 ; AVX512BW-NEXT: korw %k0, %k2, %k2
10050 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z}
10051 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
10052 ; AVX512BW-NEXT: kshiftrd $20, %k3, %k5
10053 ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
10054 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10055 ; AVX512BW-NEXT: kandw %k0, %k5, %k2
10056 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k6
10057 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10058 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k5
10059 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10060 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10061 ; AVX512BW-NEXT: kandw %k0, %k2, %k2
10062 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5
10063 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10064 ; AVX512BW-NEXT: kandw %k1, %k2, %k5
10065 ; AVX512BW-NEXT: kshiftrd $21, %k3, %k2
10066 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
10067 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
10068 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10069 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10070 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10071 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
10072 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10073 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10074 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10075 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k6
10076 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10077 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
10078 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
10079 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10080 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10081 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10082 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
10083 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10084 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10085 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10086 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
10087 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10088 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10089 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10090 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
10091 ; AVX512BW-NEXT: korw %k2, %k5, %k2
10092 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10093 ; AVX512BW-NEXT: kandw %k0, %k2, %k5
10094 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
10095 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k2
10096 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
10097 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10098 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10099 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10100 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
10101 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10102 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10103 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10104 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
10105 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10106 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10107 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10108 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
10109 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10110 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
10111 ; AVX512BW-NEXT: kshiftlw $14, %k7, %k1
10112 ; AVX512BW-NEXT: korw %k1, %k5, %k1
10113 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
10114 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
10115 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10116 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
10117 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
10118 ; AVX512BW-NEXT: kshiftrd $18, %k2, %k4
10119 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
10120 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10121 ; AVX512BW-NEXT: kandw %k1, %k4, %k5
10122 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
10123 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10124 ; AVX512BW-NEXT: kshiftrw $14, %k4, %k6
10125 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10126 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10127 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10128 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
10129 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10130 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10131 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10132 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k6
10133 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10134 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10135 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10136 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k6
10137 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10138 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10139 ; AVX512BW-NEXT: kandw %k1, %k5, %k6
10140 ; AVX512BW-NEXT: kshiftrd $19, %k2, %k5
10141 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
10142 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7
10143 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10144 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10145 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
10146 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7
10147 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10148 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10149 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
10150 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7
10151 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10152 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10153 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
10154 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7
10155 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10156 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10157 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
10158 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7
10159 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10160 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10161 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
10162 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k7
10163 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10164 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
10165 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
10166 ; AVX512BW-NEXT: korw %k5, %k6, %k5
10167 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10168 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
10169 ; AVX512BW-NEXT: kshiftrw $3, %k7, %k6
10170 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10171 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10172 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10173 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
10174 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10175 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10176 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10177 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
10178 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
10179 ; AVX512BW-NEXT: korw %k3, %k5, %k3
10180 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
10181 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
10182 ; AVX512BW-NEXT: korw %k7, %k3, %k3
10183 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z}
10184 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
10185 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k0
10186 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10187 ; AVX512BW-NEXT: kandw %k6, %k0, %k3
10188 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
10189 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
10190 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10191 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
10192 ; AVX512BW-NEXT: kandw %k7, %k3, %k3
10193 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
10194 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10195 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10196 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
10197 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
10198 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10199 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10200 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
10201 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
10202 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10203 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10204 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
10205 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
10206 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10207 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10208 ; AVX512BW-NEXT: kandw %k5, %k3, %k3
10209 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
10210 ; AVX512BW-NEXT: korw %k0, %k3, %k0
10211 ; AVX512BW-NEXT: kandw %k4, %k0, %k3
10212 ; AVX512BW-NEXT: kshiftrd $17, %k1, %k0
10213 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
10214 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k5
10215 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10216 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10217 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
10218 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k5
10219 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10220 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10221 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
10222 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k5
10223 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10224 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
10225 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k5
10226 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10227 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10228 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
10229 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k5
10230 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10231 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10232 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
10233 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k5
10234 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10235 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10236 ; AVX512BW-NEXT: kandw %k1, %k3, %k3
10237 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
10238 ; AVX512BW-NEXT: korw %k0, %k3, %k0
10239 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10240 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
10241 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
10242 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
10243 ; AVX512BW-NEXT: korw %k2, %k0, %k0
10244 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
10245 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
10246 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10247 ; AVX512BW-NEXT: korw %k1, %k0, %k1
10248 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z}
10249 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
10250 ; AVX512BW-NEXT: kshiftrd $13, %k0, %k2
10251 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
10252 ; AVX512BW-NEXT: kandw %k6, %k2, %k1
10253 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
10254 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10255 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k3
10256 ; AVX512BW-NEXT: korw %k3, %k1, %k1
10257 ; AVX512BW-NEXT: kandw %k7, %k1, %k3
10258 ; AVX512BW-NEXT: kshiftrd $14, %k0, %k1
10259 ; AVX512BW-NEXT: kmovq %k0, %k6
10260 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
10261 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
10262 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10263 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
10264 ; AVX512BW-NEXT: kandw %k7, %k3, %k3
10265 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
10266 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10267 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10268 ; AVX512BW-NEXT: kandw %k0, %k3, %k3
10269 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
10270 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10271 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10272 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
10273 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k5
10274 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10275 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10276 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
10277 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k5
10278 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10279 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10280 ; AVX512BW-NEXT: kandw %k2, %k3, %k3
10281 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k5
10282 ; AVX512BW-NEXT: korw %k5, %k3, %k3
10283 ; AVX512BW-NEXT: kandw %k4, %k3, %k3
10284 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
10285 ; AVX512BW-NEXT: korw %k1, %k3, %k1
10286 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10287 ; AVX512BW-NEXT: kandw %k4, %k1, %k5
10288 ; AVX512BW-NEXT: kshiftrd $15, %k6, %k3
10289 ; AVX512BW-NEXT: kmovq %k6, %k0
10290 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k1
10291 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
10292 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10293 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10294 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
10295 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
10296 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10297 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10298 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
10299 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
10300 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10301 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10302 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
10303 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
10304 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10305 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10306 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
10307 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
10308 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10309 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10310 ; AVX512BW-NEXT: kandw %k6, %k5, %k5
10311 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
10312 ; AVX512BW-NEXT: korw %k3, %k5, %k3
10313 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
10314 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
10315 ; AVX512BW-NEXT: korw %k1, %k3, %k1
10316 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z}
10317 ; AVX512BW-NEXT: kmovq %k0, %k3
10318 ; AVX512BW-NEXT: kshiftrd $11, %k0, %k0
10319 ; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
10320 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10321 ; AVX512BW-NEXT: kandw %k1, %k0, %k5
10322 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
10323 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
10324 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10325 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10326 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10327 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10328 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
10329 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10330 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
10331 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
10332 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10333 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10334 ; AVX512BW-NEXT: kandw %k0, %k5, %k6
10335 ; AVX512BW-NEXT: kshiftrd $12, %k3, %k5
10336 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
10337 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k7
10338 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10339 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10340 ; AVX512BW-NEXT: kandw %k1, %k6, %k6
10341 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7
10342 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10343 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10344 ; AVX512BW-NEXT: kandw %k0, %k6, %k6
10345 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7
10346 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10347 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10348 ; AVX512BW-NEXT: kandw %k0, %k6, %k6
10349 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7
10350 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10351 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10352 ; AVX512BW-NEXT: kandw %k0, %k6, %k6
10353 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7
10354 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10355 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
10356 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7
10357 ; AVX512BW-NEXT: korw %k7, %k6, %k6
10358 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10359 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
10360 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k5
10361 ; AVX512BW-NEXT: korw %k5, %k6, %k5
10362 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10363 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10364 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10365 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
10366 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10367 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10368 ; AVX512BW-NEXT: kandw %k0, %k5, %k5
10369 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
10370 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10371 ; AVX512BW-NEXT: kandw %k2, %k5, %k5
10372 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
10373 ; AVX512BW-NEXT: kmovq %k3, %k0
10374 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10375 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10376 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10377 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
10378 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
10379 ; AVX512BW-NEXT: korw %k2, %k5, %k2
10380 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
10381 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
10382 ; AVX512BW-NEXT: korw %k0, %k2, %k2
10383 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k2} {z}
10384 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
10385 ; AVX512BW-NEXT: kshiftrd $9, %k6, %k0
10386 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10387 ; AVX512BW-NEXT: kandw %k2, %k0, %k2
10388 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
10389 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
10390 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10391 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10392 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
10393 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
10394 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10395 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10396 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
10397 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
10398 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10399 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
10400 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
10401 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
10402 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10403 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10404 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
10405 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10406 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10407 ; AVX512BW-NEXT: kandw %k1, %k2, %k5
10408 ; AVX512BW-NEXT: kshiftrd $10, %k6, %k2
10409 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
10410 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
10411 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10412 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10413 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10414 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
10415 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10416 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10417 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10418 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
10419 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10420 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10421 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10422 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
10423 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10424 ; AVX512BW-NEXT: kandw %k4, %k5, %k5
10425 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
10426 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10427 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10428 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10429 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
10430 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10431 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10432 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10433 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
10434 ; AVX512BW-NEXT: korw %k2, %k5, %k2
10435 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10436 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10437 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10438 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k5
10439 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10440 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
10441 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
10442 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
10443 ; AVX512BW-NEXT: korw %k1, %k2, %k1
10444 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
10445 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
10446 ; AVX512BW-NEXT: korw %k4, %k1, %k1
10447 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
10448 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
10449 ; AVX512BW-NEXT: kshiftrd $7, %k4, %k1
10450 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k3
10451 ; AVX512BW-NEXT: kshiftrd $6, %k4, %k2
10452 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
10453 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10454 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10455 ; AVX512BW-NEXT: kshiftrw $14, %k3, %k5
10456 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10457 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10458 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10459 ; AVX512BW-NEXT: kshiftrw $13, %k3, %k5
10460 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10461 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10462 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10463 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k5
10464 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10465 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
10466 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
10467 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10468 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10469 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10470 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
10471 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10472 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10473 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10474 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
10475 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10476 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10477 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10478 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3
10479 ; AVX512BW-NEXT: korw %k3, %k2, %k2
10480 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10481 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10482 ; AVX512BW-NEXT: kshiftrd $8, %k4, %k3
10483 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k5
10484 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
10485 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10486 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
10487 ; AVX512BW-NEXT: kandw %k1, %k2, %k2
10488 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
10489 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10490 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10491 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10492 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
10493 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10494 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
10495 ; AVX512BW-NEXT: kandw %k7, %k2, %k2
10496 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
10497 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10498 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10499 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
10500 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
10501 ; AVX512BW-NEXT: korw %k6, %k2, %k2
10502 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10503 ; AVX512BW-NEXT: kandw %k6, %k2, %k2
10504 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
10505 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10506 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
10507 ; AVX512BW-NEXT: kandw %k5, %k2, %k2
10508 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
10509 ; AVX512BW-NEXT: korw %k3, %k2, %k2
10510 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
10511 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
10512 ; AVX512BW-NEXT: korw %k0, %k2, %k2
10513 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k2} {z}
10514 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
10515 ; AVX512BW-NEXT: kshiftrd $4, %k6, %k3
10516 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
10517 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
10518 ; AVX512BW-NEXT: kandw %k0, %k3, %k2
10519 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k0
10520 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
10521 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10522 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10523 ; AVX512BW-NEXT: kandw %k3, %k2, %k2
10524 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
10525 ; AVX512BW-NEXT: korw %k5, %k2, %k2
10526 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10527 ; AVX512BW-NEXT: kandw %k3, %k2, %k5
10528 ; AVX512BW-NEXT: kshiftrd $5, %k6, %k2
10529 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
10530 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
10531 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10532 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10533 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10534 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
10535 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10536 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10537 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10538 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k6
10539 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10540 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10541 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10542 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
10543 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10544 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10545 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10546 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
10547 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10548 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10549 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10550 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
10551 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10552 ; AVX512BW-NEXT: kandw %k1, %k5, %k5
10553 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
10554 ; AVX512BW-NEXT: korw %k2, %k5, %k2
10555 ; AVX512BW-NEXT: kandw %k4, %k2, %k5
10556 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
10557 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
10558 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
10559 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10560 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
10561 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
10562 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10563 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
10564 ; AVX512BW-NEXT: kandw %k3, %k5, %k5
10565 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
10566 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10567 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
10568 ; AVX512BW-NEXT: kandw %k7, %k5, %k5
10569 ; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
10570 ; AVX512BW-NEXT: korw %k6, %k5, %k5
10571 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
10572 ; AVX512BW-NEXT: kandw %k6, %k5, %k5
10573 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
10574 ; AVX512BW-NEXT: korw %k1, %k5, %k1
10575 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
10576 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
10577 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10578 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z}
10579 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
10580 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10581 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10582 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10583 ; AVX512BW-NEXT: kshiftrw $14, %k4, %k2
10584 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10585 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10586 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10587 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k2
10588 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10589 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10590 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10591 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k2
10592 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10593 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10594 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10595 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k2
10596 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10597 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
10598 ; AVX512BW-NEXT: kandw %k2, %k1, %k2
10599 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
10600 ; AVX512BW-NEXT: kshiftrd $3, %k1, %k1
10601 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
10602 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k4
10603 ; AVX512BW-NEXT: korw %k4, %k2, %k2
10604 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10605 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10606 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k4
10607 ; AVX512BW-NEXT: korw %k4, %k2, %k2
10608 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10609 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10610 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k4
10611 ; AVX512BW-NEXT: korw %k4, %k2, %k2
10612 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10613 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10614 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k4
10615 ; AVX512BW-NEXT: korw %k4, %k2, %k2
10616 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10617 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10618 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k4
10619 ; AVX512BW-NEXT: korw %k4, %k2, %k2
10620 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10621 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10622 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k4
10623 ; AVX512BW-NEXT: korw %k4, %k2, %k2
10624 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
10625 ; AVX512BW-NEXT: kandw %k4, %k2, %k2
10626 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
10627 ; AVX512BW-NEXT: korw %k1, %k2, %k1
10628 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
10629 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
10630 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10631 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
10632 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k2
10633 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10634 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
10635 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
10636 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
10637 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10638 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
10639 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
10640 ; AVX512BW-NEXT: korw %k0, %k1, %k1
10641 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
10642 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx)
10643 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx)
10644 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rdx)
10645 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx)
10646 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx)
10647 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rdx)
10648 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
10649 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 512(%rdx)
10650 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rdx)
10651 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rdx)
10652 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rdx)
10653 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rdx)
10654 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 832(%rdx)
10655 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
10656 ; AVX512BW-NEXT: vzeroupper
10657 ; AVX512BW-NEXT: retq
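; The IR below spells out the factor-7 replication directly: the low 32 bits of
; the input mask are expanded to a <224 x i1> mask by repeating each bit seven
; times, that mask guards a <224 x i32> masked load from %in.vec, and the loaded
; data is stored contiguously to %out.vec.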
10658 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
10659 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
10660 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
10661 %data = call <224 x i32> @llvm.masked.load.v224i32.p0(ptr %in.vec, i32 64, <224 x i1> %tgt.mask, <224 x i32> poison)
store <224 x i32> %data, ptr %out.vec, align 64
ret void
}
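
; mask_replication_factor7_vf64 below repeats the same pattern for a 64-bit
; source mask: each mask bit is replicated seven times and the resulting
; 448-lane mask gates the masked loads that are stored back out contiguously.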
10666 define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
10667 ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64:
10668 ; AVX512F-ONLY: # %bb.0:
10669 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
10670 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
10671 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
10672 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0
10673 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
10674 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
10675 ; AVX512F-ONLY-NEXT: movw $1, %ax
10676 ; AVX512F-ONLY-NEXT: kmovw %eax, %k1
10677 ; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
10678 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
10679 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1
10680 ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
10681 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm9 {%k1} {z} = -1
10682 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
10683 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm11 {%k1} {z} = -1
10684 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
10685 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
10686 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0
10687 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
10688 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2
10689 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
10690 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3
10691 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
10692 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4
10693 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
10694 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6
10695 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
10696 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7
10697 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8
10698 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10
10699 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm12
10700 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm16, %zmm14
10701 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm20
10702 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm21
10703 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm22
10704 ; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm5, %zmm23
10705 ; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm13, %zmm24
10706 ; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm15, %zmm25
10707 ; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm16, %zmm26
10708 ; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm17, %zmm27
10709 ; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm18, %zmm28
10710 ; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm5, %zmm29
10711 ; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm19, %zmm30
10712 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm31
10713 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm15
10714 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm13
10715 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm11
10716 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm18, %zmm9
10717 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm19, %zmm5
10718 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
10719 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
10720 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
10721 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
10722 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
10723 ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
10724 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
10725 ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
10726 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z}
10727 ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
10728 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
10729 ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1
10730 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z}
10731 ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1
10732 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z}
10733 ; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1
10734 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
10735 ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1
10736 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z}
10737 ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1
10738 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z}
10739 ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1
10740 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z}
10741 ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1
10742 ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z}
10743 ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1
10744 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z}
10745 ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1
10746 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z}
10747 ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1
10748 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
10749 ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1
10750 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
10751 ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1
10752 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
10753 ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
10754 ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z}
10755 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
10756 ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z}
10757 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
10758 ; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z}
10759 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
10760 ; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z}
10761 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
10762 ; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z}
10763 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
10764 ; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
10765 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
10766 ; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
10767 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
10768 ; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z}
10769 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
10770 ; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z}
10771 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
10772 ; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z}
10773 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1728(%rdx)
10774 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1664(%rdx)
10775 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1600(%rdx)
10776 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx)
10777 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx)
10778 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1408(%rdx)
10779 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx)
10780 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx)
10781 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx)
10782 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1152(%rdx)
10783 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx)
10784 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx)
10785 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx)
10786 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 896(%rdx)
10787 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 832(%rdx)
10788 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 768(%rdx)
10789 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 704(%rdx)
10790 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 640(%rdx)
10791 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx)
10792 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx)
10793 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 448(%rdx)
10794 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 384(%rdx)
10795 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx)
10796 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 256(%rdx)
10797 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx)
10798 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx)
10799 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
10800 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
10801 ; AVX512F-ONLY-NEXT: vzeroupper
10802 ; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf64:
10805 ; AVX512DQ: # %bb.0:
10806 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
10807 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
10808 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
10809 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0
10810 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
10811 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
10812 ; AVX512DQ-NEXT: movw $1, %ax
10813 ; AVX512DQ-NEXT: kmovw %eax, %k1
10814 ; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
10815 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
10816 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
10817 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
10818 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9
10819 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
10820 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11
10821 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
10822 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15]
10823 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0
10824 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13]
10825 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2
10826 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11]
10827 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3
10828 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9]
10829 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4
10830 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
10831 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6
10832 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
10833 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7
10834 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8
10835 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10
10836 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm12
10837 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm16, %zmm14
10838 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm20
10839 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm21
10840 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm22
10841 ; AVX512DQ-NEXT: vpermd %zmm9, %zmm5, %zmm23
10842 ; AVX512DQ-NEXT: vpermd %zmm11, %zmm13, %zmm24
10843 ; AVX512DQ-NEXT: vpermd %zmm11, %zmm15, %zmm25
10844 ; AVX512DQ-NEXT: vpermd %zmm11, %zmm16, %zmm26
10845 ; AVX512DQ-NEXT: vpermd %zmm11, %zmm17, %zmm27
10846 ; AVX512DQ-NEXT: vpermd %zmm11, %zmm18, %zmm28
10847 ; AVX512DQ-NEXT: vpermd %zmm11, %zmm5, %zmm29
10848 ; AVX512DQ-NEXT: vpermd %zmm11, %zmm19, %zmm30
10849 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm31
10850 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm15
10851 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm13
10852 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm11
10853 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm9
10854 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm5
10855 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
10856 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
10857 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
10858 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
10859 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z}
10860 ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
10861 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z}
10862 ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
10863 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z}
10864 ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
10865 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z}
10866 ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1
10867 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z}
10868 ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1
10869 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z}
10870 ; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1
10871 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z}
10872 ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1
10873 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z}
10874 ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1
10875 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z}
10876 ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1
10877 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z}
10878 ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1
10879 ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z}
10880 ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1
10881 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z}
10882 ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1
10883 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z}
10884 ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1
10885 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z}
10886 ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1
10887 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z}
10888 ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1
10889 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z}
10890 ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
10891 ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z}
10892 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
10893 ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z}
10894 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
10895 ; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z}
10896 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
10897 ; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z}
10898 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
10899 ; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z}
10900 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
10901 ; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z}
10902 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
10903 ; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z}
10904 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
10905 ; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z}
10906 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
10907 ; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z}
10908 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
10909 ; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z}
10910 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1728(%rdx)
10911 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rdx)
10912 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1600(%rdx)
10913 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx)
10914 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx)
10915 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1408(%rdx)
10916 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx)
10917 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx)
10918 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx)
10919 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%rdx)
10920 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx)
10921 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx)
10922 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx)
10923 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 896(%rdx)
10924 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 832(%rdx)
10925 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 768(%rdx)
10926 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rdx)
10927 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 640(%rdx)
10928 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx)
10929 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx)
10930 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rdx)
10931 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx)
10932 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx)
10933 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx)
10934 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx)
10935 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx)
10936 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
10937 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
10938 ; AVX512DQ-NEXT: vzeroupper
10939 ; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf64:
10942 ; AVX512BW: # %bb.0:
10943 ; AVX512BW-NEXT: movw $-3, %ax
10944 ; AVX512BW-NEXT: kmovd %eax, %k1
10945 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10946 ; AVX512BW-NEXT: kmovw (%rdi), %k0
10947 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
10948 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
10949 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k2
10950 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10951 ; AVX512BW-NEXT: movw $-5, %ax
10952 ; AVX512BW-NEXT: kmovd %eax, %k2
10953 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10954 ; AVX512BW-NEXT: kmovq %k2, %k3
10955 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10956 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2
10957 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10958 ; AVX512BW-NEXT: movw $-9, %ax
10959 ; AVX512BW-NEXT: kmovd %eax, %k2
10960 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10961 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10962 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k2
10963 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10964 ; AVX512BW-NEXT: movw $-17, %ax
10965 ; AVX512BW-NEXT: kmovd %eax, %k2
10966 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10967 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10968 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k2
10969 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10970 ; AVX512BW-NEXT: movw $-33, %ax
10971 ; AVX512BW-NEXT: kmovd %eax, %k2
10972 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10973 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10974 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k2
10975 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10976 ; AVX512BW-NEXT: movw $-65, %ax
10977 ; AVX512BW-NEXT: kmovd %eax, %k2
10978 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10979 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10980 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
10981 ; AVX512BW-NEXT: korw %k0, %k1, %k0
10982 ; AVX512BW-NEXT: movw $-129, %ax
10983 ; AVX512BW-NEXT: kmovd %eax, %k1
10984 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10985 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
10986 ; AVX512BW-NEXT: kmovq (%rdi), %k4
10987 ; AVX512BW-NEXT: kshiftrq $1, %k4, %k0
10988 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
10989 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2
10990 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10991 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
10992 ; AVX512BW-NEXT: kmovd %eax, %k2
10993 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
10994 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
10995 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k2
10996 ; AVX512BW-NEXT: korw %k2, %k1, %k1
10997 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
10998 ; AVX512BW-NEXT: kmovd %eax, %k2
10999 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
11000 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11001 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
11002 ; AVX512BW-NEXT: korw %k2, %k1, %k1
11003 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
11004 ; AVX512BW-NEXT: kmovd %eax, %k5
11005 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11006 ; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
11007 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
11008 ; AVX512BW-NEXT: korw %k2, %k1, %k1
11009 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
11010 ; AVX512BW-NEXT: kmovd %eax, %k2
11011 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
11012 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11013 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
11014 ; AVX512BW-NEXT: korw %k2, %k1, %k1
11015 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
11016 ; AVX512BW-NEXT: kmovd %eax, %k2
11017 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
11018 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11019 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
11020 ; AVX512BW-NEXT: korw %k2, %k1, %k1
11021 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
11022 ; AVX512BW-NEXT: kmovd %eax, %k2
11023 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
11024 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11025 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
11026 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11027 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
11028 ; AVX512BW-NEXT: kmovd %eax, %k1
11029 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
11030 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11031 ; AVX512BW-NEXT: kshiftrq $2, %k4, %k1
11032 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
11033 ; AVX512BW-NEXT: korw %k7, %k0, %k0
11034 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
11035 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
11036 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k7
11037 ; AVX512BW-NEXT: korw %k7, %k0, %k6
11038 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z}
11039 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11040 ; AVX512BW-NEXT: kandw %k2, %k1, %k0
11041 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1
11042 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11043 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
11044 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1
11045 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11046 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11047 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
11048 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k1
11049 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11050 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11051 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11052 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k1
11053 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11054 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11055 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11056 ; AVX512BW-NEXT: kmovq %k4, %k7
11057 ; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
11058 ; AVX512BW-NEXT: kshiftrq $3, %k4, %k0
11059 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11060 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11061 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11062 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11063 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11064 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11065 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11066 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11067 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11068 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11069 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11070 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11071 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11072 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11073 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11074 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11075 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11076 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11077 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11078 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11079 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
11080 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11081 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11082 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11083 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
11084 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11085 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11086 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11087 ; AVX512BW-NEXT: kshiftrq $4, %k7, %k6
11088 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
11089 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
11090 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11091 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11092 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
11093 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
11094 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11095 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11096 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
11097 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
11098 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11099 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
11100 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
11101 ; AVX512BW-NEXT: korw %k0, %k1, %k1
11102 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
11103 ; AVX512BW-NEXT: kandw %k2, %k6, %k1
11104 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11105 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11106 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11107 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11108 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
11109 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11110 ; AVX512BW-NEXT: kandw %k3, %k0, %k1
11111 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
11112 ; AVX512BW-NEXT: kshiftrq $5, %k7, %k0
11113 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11114 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11115 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11116 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11117 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11118 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11119 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11120 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11121 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11122 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11123 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11124 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11125 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11126 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11127 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11128 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11129 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11130 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11131 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11132 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11133 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11134 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11135 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11136 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11137 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
11138 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11139 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11140 ; AVX512BW-NEXT: kandw %k1, %k0, %k6
11141 ; AVX512BW-NEXT: kshiftrq $6, %k7, %k0
11142 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
11143 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
11144 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11145 ; AVX512BW-NEXT: kandw %k5, %k6, %k6
11146 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
11147 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11148 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11149 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
11150 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
11151 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11152 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11153 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
11154 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
11155 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11156 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11157 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
11158 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
11159 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11160 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
11161 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
11162 ; AVX512BW-NEXT: korw %k1, %k6, %k1
11163 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
11164 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11165 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11166 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
11167 ; AVX512BW-NEXT: kshiftrq $7, %k4, %k0
11168 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11169 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11170 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11171 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11172 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11173 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11174 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11175 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11176 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11177 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11178 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11179 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11180 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11181 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11182 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11183 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11184 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11185 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11186 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11187 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11188 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11189 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11190 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11191 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
11192 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11193 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11194 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11195 ; AVX512BW-NEXT: kshiftrq $8, %k4, %k0
11196 ; AVX512BW-NEXT: kmovq %k4, %k5
11197 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
11198 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
11199 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11200 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11201 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11202 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
11203 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11204 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11205 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11206 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
11207 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11208 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11209 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11210 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
11211 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11212 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11213 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11214 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
11215 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11216 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11217 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11218 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
11219 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11220 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11221 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11222 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
11223 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11224 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
11225 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
11226 ; AVX512BW-NEXT: kshiftrq $9, %k5, %k1
11227 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
11228 ; AVX512BW-NEXT: korw %k6, %k0, %k7
11229 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
11230 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
11231 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
11232 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
11233 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11234 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11235 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11236 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
11237 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11238 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
11239 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
11240 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11241 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11242 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
11243 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
11244 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11245 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11246 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11247 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
11248 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11249 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11250 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11251 ; AVX512BW-NEXT: kshiftrq $10, %k5, %k0
11252 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11253 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11254 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11255 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11256 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11257 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11258 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11259 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11260 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11261 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11262 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11263 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11264 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11265 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11266 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11267 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11268 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
11269 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11270 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11271 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11272 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
11273 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11274 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11275 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11276 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
11277 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11278 ; AVX512BW-NEXT: kandw %k3, %k0, %k1
11279 ; AVX512BW-NEXT: kshiftrq $11, %k5, %k6
11280 ; AVX512BW-NEXT: kmovq %k5, %k4
11281 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
11282 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
11283 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11284 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11285 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11286 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
11287 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11288 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
11289 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
11290 ; AVX512BW-NEXT: korw %k0, %k1, %k1
11291 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
11292 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11293 ; AVX512BW-NEXT: kandw %k1, %k6, %k1
11294 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11295 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11296 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11297 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11298 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11299 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11300 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11301 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11302 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
11303 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11304 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
11305 ; AVX512BW-NEXT: kmovq %k4, %k7
11306 ; AVX512BW-NEXT: kshiftrq $12, %k4, %k0
11307 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11308 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11309 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11310 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11311 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11312 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11313 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11314 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11315 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11316 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11317 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11318 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11319 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11320 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11321 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11322 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11323 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11324 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11325 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11326 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11327 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11328 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11329 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11330 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11331 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11332 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
11333 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11334 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11335 ; AVX512BW-NEXT: kandw %k1, %k0, %k6
11336 ; AVX512BW-NEXT: kshiftrq $13, %k7, %k0
11337 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
11338 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
11339 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11340 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11341 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
11342 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
11343 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11344 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11345 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
11346 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
11347 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11348 ; AVX512BW-NEXT: kandw %k5, %k6, %k6
11349 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
11350 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11351 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
11352 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
11353 ; AVX512BW-NEXT: korw %k1, %k6, %k6
11354 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z}
11355 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11356 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
11357 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
11358 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11359 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11360 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11361 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
11362 ; AVX512BW-NEXT: kshiftrq $14, %k5, %k0
11363 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11364 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11365 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11366 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11367 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11368 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11369 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11370 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11371 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11372 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11373 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11374 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11375 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11376 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11377 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11378 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11379 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11380 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11381 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11382 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11383 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11384 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11385 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11386 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
11387 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11388 ; AVX512BW-NEXT: kandw %k3, %k0, %k6
11389 ; AVX512BW-NEXT: kshiftrq $15, %k5, %k1
11390 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
11391 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
11392 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11393 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11394 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11395 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
11396 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11397 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11398 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11399 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
11400 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11401 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11402 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
11403 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
11404 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11405 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11406 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
11407 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
11408 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11409 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11410 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
11411 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
11412 ; AVX512BW-NEXT: korw %k1, %k6, %k1
11413 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
11414 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
11415 ; AVX512BW-NEXT: korw %k0, %k1, %k1
11416 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
11417 ; AVX512BW-NEXT: kshiftrq $16, %k5, %k0
11418 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11419 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11420 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11421 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11422 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11423 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11424 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11425 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11426 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11427 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11428 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11429 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11430 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11431 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11432 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11433 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11434 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11435 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11436 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11437 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11438 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11439 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11440 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
11441 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11442 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11443 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
11444 ; AVX512BW-NEXT: kshiftrq $17, %k5, %k0
11445 ; AVX512BW-NEXT: kmovq %k5, %k7
11446 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11447 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11448 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11449 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11450 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11451 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11452 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11453 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11454 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11455 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11456 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11457 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11458 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11459 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
11460 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11461 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11462 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
11463 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11464 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11465 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11466 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
11467 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11468 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11469 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11470 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
11471 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11472 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
11473 ; AVX512BW-NEXT: kshiftrq $18, %k7, %k1
11474 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
11475 ; AVX512BW-NEXT: korw %k6, %k0, %k0
11476 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
11477 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
11478 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
11479 ; AVX512BW-NEXT: korw %k6, %k0, %k7
11480 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z}
11481 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11482 ; AVX512BW-NEXT: kandw %k5, %k1, %k0
11483 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
11484 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11485 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11486 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
11487 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
11488 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11489 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11490 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11491 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
11492 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11493 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11494 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
11495 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
11496 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11497 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11498 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11499 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
11500 ; AVX512BW-NEXT: kshiftrq $19, %k7, %k0
11501 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11502 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11503 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11504 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11505 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11506 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11507 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11508 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11509 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11510 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11511 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11512 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11513 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11514 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11515 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11516 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11517 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11518 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11519 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11520 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11521 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
11522 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11523 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11524 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11525 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
11526 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11527 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11528 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11529 ; AVX512BW-NEXT: kshiftrq $20, %k7, %k6
11530 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
11531 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
11532 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11533 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11534 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
11535 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
11536 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11537 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11538 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
11539 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
11540 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11541 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
11542 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
11543 ; AVX512BW-NEXT: korw %k0, %k1, %k1
11544 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
11545 ; AVX512BW-NEXT: kandw %k5, %k6, %k1
11546 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11547 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11548 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11549 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
11550 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11551 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11552 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11553 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
11554 ; AVX512BW-NEXT: kshiftrq $21, %k7, %k0
11555 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11556 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11557 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11558 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11559 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11560 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11561 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11562 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11563 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11564 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11565 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11566 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11567 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11568 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11569 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11570 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11571 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11572 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11573 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11574 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11575 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11576 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11577 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11578 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
11579 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11580 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11581 ; AVX512BW-NEXT: kandw %k1, %k0, %k6
11582 ; AVX512BW-NEXT: kshiftrq $22, %k7, %k0
11583 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
11584 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
11585 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11586 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11587 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11588 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
11589 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11590 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11591 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11592 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
11593 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11594 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11595 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11596 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
11597 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11598 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11599 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11600 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
11601 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11602 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
11603 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
11604 ; AVX512BW-NEXT: korw %k1, %k6, %k1
11605 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
11606 ; AVX512BW-NEXT: kandw %k5, %k0, %k1
11607 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
11608 ; AVX512BW-NEXT: kshiftrq $23, %k7, %k0
11609 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11610 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11611 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11612 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11613 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11614 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11615 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11616 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11617 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11618 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11619 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11620 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11621 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11622 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11623 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11624 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11625 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11626 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11627 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11628 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11629 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11630 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11631 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11632 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11633 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
11634 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11635 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11636 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11637 ; AVX512BW-NEXT: kshiftrq $24, %k7, %k0
11638 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
11639 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
11640 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11641 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11642 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
11643 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11644 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11645 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11646 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
11647 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11648 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11649 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11650 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
11651 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11652 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11653 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11654 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
11655 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11656 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11657 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11658 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
11659 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11660 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11661 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11662 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
11663 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11664 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
11665 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
11666 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
11667 ; AVX512BW-NEXT: kshiftrq $25, %k2, %k1
11668 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
11669 ; AVX512BW-NEXT: korw %k6, %k0, %k7
11670 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
11671 ; AVX512BW-NEXT: kandw %k5, %k1, %k0
11672 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
11673 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11674 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11675 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11676 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
11677 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11678 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
11679 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
11680 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11681 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11682 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11683 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
11684 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11685 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11686 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
11687 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
11688 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11689 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11690 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11691 ; AVX512BW-NEXT: kmovq %k2, %k7
11692 ; AVX512BW-NEXT: kshiftrq $26, %k2, %k0
11693 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11694 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11695 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11696 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11697 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11698 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11699 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11700 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11701 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11702 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11703 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11704 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11705 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11706 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11707 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11708 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11709 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11710 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
11711 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11712 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11713 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11714 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
11715 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11716 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11717 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11718 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
11719 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11720 ; AVX512BW-NEXT: kandw %k4, %k0, %k1
11721 ; AVX512BW-NEXT: kshiftrq $27, %k7, %k6
11722 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
11723 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
11724 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11725 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11726 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11727 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
11728 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11729 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
11730 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
11731 ; AVX512BW-NEXT: korw %k0, %k1, %k1
11732 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
11733 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11734 ; AVX512BW-NEXT: kandw %k1, %k6, %k1
11735 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11736 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11737 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11738 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11739 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11740 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11741 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11742 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11743 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
11744 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11745 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11746 ; AVX512BW-NEXT: kandw %k4, %k0, %k1
11747 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
11748 ; AVX512BW-NEXT: kshiftrq $28, %k7, %k0
11749 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11750 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11751 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11752 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11753 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11754 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11755 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11756 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11757 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11758 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11759 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11760 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11761 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11762 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11763 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11764 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11765 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11766 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11767 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11768 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11769 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11770 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11771 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11772 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11773 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
11774 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11775 ; AVX512BW-NEXT: kandw %k2, %k0, %k6
11776 ; AVX512BW-NEXT: kshiftrq $29, %k7, %k0
11777 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
11778 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
11779 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11780 ; AVX512BW-NEXT: kandw %k5, %k6, %k6
11781 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
11782 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11783 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11784 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11785 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
11786 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11787 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11788 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
11789 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
11790 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11791 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
11792 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
11793 ; AVX512BW-NEXT: korw %k1, %k6, %k6
11794 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z}
11795 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11796 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
11797 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
11798 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11799 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11800 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11801 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
11802 ; AVX512BW-NEXT: kshiftrq $30, %k5, %k0
11803 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11804 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11805 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11806 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11807 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11808 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11809 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11810 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11811 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11812 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11813 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11814 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11815 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11816 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11817 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11818 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11819 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11820 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11821 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11822 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11823 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11824 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11825 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11826 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11827 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
11828 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11829 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11830 ; AVX512BW-NEXT: kandw %k1, %k0, %k6
11831 ; AVX512BW-NEXT: kshiftrq $31, %k5, %k1
11832 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
11833 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
11834 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11835 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11836 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
11837 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
11838 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11839 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11840 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
11841 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
11842 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11843 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11844 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
11845 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
11846 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11847 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11848 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
11849 ; AVX512BW-NEXT: korw %k7, %k6, %k6
11850 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11851 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
11852 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
11853 ; AVX512BW-NEXT: korw %k1, %k6, %k1
11854 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
11855 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
11856 ; AVX512BW-NEXT: korw %k0, %k1, %k1
11857 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z}
11858 ; AVX512BW-NEXT: kshiftrq $32, %k5, %k0
11859 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
11860 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11861 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11862 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11863 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11864 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
11865 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
11866 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11867 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11868 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11869 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11870 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11871 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11872 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11873 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
11874 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11875 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11876 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11877 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11878 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11879 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11880 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
11881 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11882 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11883 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11884 ; AVX512BW-NEXT: kshiftrq $33, %k5, %k0
11885 ; AVX512BW-NEXT: kmovq %k5, %k7
11886 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11887 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11888 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11889 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11890 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11891 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11892 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11893 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11894 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11895 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11896 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11897 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11898 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11899 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
11900 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11901 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11902 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11903 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
11904 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11905 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11906 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11907 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
11908 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11909 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11910 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11911 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
11912 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11913 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
11914 ; AVX512BW-NEXT: kmovq %k7, %k3
11915 ; AVX512BW-NEXT: kshiftrq $34, %k7, %k1
11916 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
11917 ; AVX512BW-NEXT: korw %k6, %k0, %k0
11918 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
11919 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
11920 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
11921 ; AVX512BW-NEXT: korw %k6, %k0, %k7
11922 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k7} {z}
11923 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
11924 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
11925 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
11926 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11927 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
11928 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
11929 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11930 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
11931 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
11932 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
11933 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11934 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11935 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
11936 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
11937 ; AVX512BW-NEXT: korw %k1, %k0, %k0
11938 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11939 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11940 ; AVX512BW-NEXT: kshiftrq $35, %k3, %k0
11941 ; AVX512BW-NEXT: kmovq %k3, %k7
11942 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11943 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
11944 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11945 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11946 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11947 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
11948 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11949 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
11950 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
11951 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
11952 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11953 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
11954 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
11955 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
11956 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11957 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11958 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
11959 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11960 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
11961 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
11962 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
11963 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11964 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11965 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11966 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
11967 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11968 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11969 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
11970 ; AVX512BW-NEXT: kshiftrq $36, %k7, %k6
11971 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
11972 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
11973 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11974 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11975 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
11976 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
11977 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11978 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
11979 ; AVX512BW-NEXT: kandw %k7, %k1, %k1
11980 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
11981 ; AVX512BW-NEXT: korw %k7, %k1, %k1
11982 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
11983 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
11984 ; AVX512BW-NEXT: korw %k0, %k1, %k1
11985 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z}
11986 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
11987 ; AVX512BW-NEXT: kandw %k1, %k6, %k1
11988 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
11989 ; AVX512BW-NEXT: korw %k6, %k1, %k1
11990 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
11991 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
11992 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
11993 ; AVX512BW-NEXT: korw %k0, %k1, %k0
11994 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
11995 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
11996 ; AVX512BW-NEXT: kshiftrq $37, %k7, %k0
11997 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
11998 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
11999 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12000 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12001 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12002 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12003 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12004 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12005 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12006 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12007 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12008 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12009 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12010 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12011 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12012 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12013 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12014 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12015 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12016 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12017 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12018 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12019 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12020 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
12021 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12022 ; AVX512BW-NEXT: kandw %k5, %k0, %k6
12023 ; AVX512BW-NEXT: kshiftrq $38, %k7, %k0
12024 ; AVX512BW-NEXT: kmovq %k7, %k5
12025 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
12026 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
12027 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12028 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12029 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
12030 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
12031 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12032 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12033 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
12034 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
12035 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12036 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12037 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
12038 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
12039 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12040 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12041 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
12042 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
12043 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12044 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
12045 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
12046 ; AVX512BW-NEXT: korw %k1, %k6, %k1
12047 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
12048 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12049 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12050 ; AVX512BW-NEXT: kmovq %k5, %k7
12051 ; AVX512BW-NEXT: kshiftrq $39, %k5, %k0
12052 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12053 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
12054 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12055 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12056 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12057 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
12058 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12059 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12060 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12061 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
12062 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12063 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12064 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12065 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12066 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12067 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12068 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12069 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12070 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12071 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12072 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12073 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12074 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12075 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12076 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
12077 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12078 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12079 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12080 ; AVX512BW-NEXT: kshiftrq $40, %k7, %k0
12081 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
12082 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
12083 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12084 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12085 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12086 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
12087 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12088 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12089 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12090 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
12091 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12092 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12093 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12094 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
12095 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12096 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12097 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
12098 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12099 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12100 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12101 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
12102 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12103 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12104 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12105 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
12106 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12107 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
12108 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
12109 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
12110 ; AVX512BW-NEXT: kshiftrq $41, %k4, %k1
12111 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
12112 ; AVX512BW-NEXT: korw %k6, %k0, %k7
12113 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
12114 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
12115 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
12116 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
12117 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12118 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12119 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
12120 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
12121 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12122 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12123 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
12124 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
12125 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12126 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12127 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
12128 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
12129 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12130 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
12131 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
12132 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12133 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12134 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12135 ; AVX512BW-NEXT: kshiftrq $42, %k4, %k0
12136 ; AVX512BW-NEXT: kmovq %k4, %k3
12137 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12138 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12139 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12140 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12141 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12142 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12143 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12144 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12145 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12146 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12147 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12148 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12149 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
12150 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12151 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12152 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12153 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
12154 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12155 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12156 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12157 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
12158 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12159 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12160 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12161 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
12162 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12163 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
12164 ; AVX512BW-NEXT: kshiftrq $43, %k3, %k6
12165 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
12166 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
12167 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12168 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12169 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12170 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
12171 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12172 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
12173 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
12174 ; AVX512BW-NEXT: korw %k0, %k1, %k1
12175 ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z}
12176 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12177 ; AVX512BW-NEXT: kandw %k5, %k6, %k1
12178 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
12179 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12180 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12181 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12182 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
12183 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12184 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12185 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12186 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
12187 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12188 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12189 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12190 ; AVX512BW-NEXT: kmovq %k3, %k7
12191 ; AVX512BW-NEXT: kshiftrq $44, %k3, %k0
12192 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12193 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12194 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12195 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12196 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12197 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12198 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12199 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12200 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12201 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12202 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12203 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12204 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12205 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12206 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12207 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12208 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12209 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12210 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12211 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12212 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12213 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
12214 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12215 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12216 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12217 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
12218 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12219 ; AVX512BW-NEXT: kandw %k4, %k0, %k6
12220 ; AVX512BW-NEXT: kshiftrq $45, %k7, %k0
12221 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
12222 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
12223 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12224 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12225 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
12226 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
12227 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12228 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
12229 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
12230 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
12231 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12232 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
12233 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
12234 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
12235 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12236 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
12237 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
12238 ; AVX512BW-NEXT: korw %k1, %k6, %k6
12239 ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z}
12240 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
12241 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
12242 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12243 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12244 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12245 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
12246 ; AVX512BW-NEXT: kshiftrq $46, %k5, %k0
12247 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12248 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
12249 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12250 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12251 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12252 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
12253 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12254 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12255 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12256 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12257 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12258 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12259 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12260 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12261 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12262 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12263 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12264 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12265 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12266 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12267 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12268 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12269 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12270 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12271 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
12272 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12273 ; AVX512BW-NEXT: kandw %k2, %k0, %k6
12274 ; AVX512BW-NEXT: kshiftrq $47, %k5, %k1
12275 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
12276 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
12277 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12278 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12279 ; AVX512BW-NEXT: kandw %k3, %k6, %k6
12280 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
12281 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12282 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12283 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
12284 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
12285 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12286 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
12287 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
12288 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12289 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12290 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
12291 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
12292 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12293 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12294 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
12295 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
12296 ; AVX512BW-NEXT: korw %k1, %k6, %k1
12297 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
12298 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
12299 ; AVX512BW-NEXT: korw %k0, %k1, %k1
12300 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
12301 ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
12302 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12303 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12304 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12305 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
12306 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12307 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12308 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12309 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
12310 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12311 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12312 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12313 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
12314 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12315 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12316 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12317 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12318 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12319 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12320 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12321 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12322 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12323 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12324 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12325 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
12326 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12327 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12328 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12329 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k0
12330 ; AVX512BW-NEXT: kmovq %k5, %k7
12331 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12332 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12333 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12334 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12335 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12336 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12337 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12338 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12339 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12340 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
12341 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12342 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12343 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
12344 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12345 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12346 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12347 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
12348 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12349 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12350 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12351 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
12352 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12353 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12354 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12355 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
12356 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12357 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
12358 ; AVX512BW-NEXT: kshiftrq $50, %k7, %k1
12359 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
12360 ; AVX512BW-NEXT: korw %k6, %k0, %k0
12361 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
12362 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
12363 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
12364 ; AVX512BW-NEXT: korw %k6, %k0, %k7
12365 ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
12366 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
12367 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
12368 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
12369 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12370 ; AVX512BW-NEXT: kandw %k4, %k0, %k0
12371 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
12372 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12373 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12374 ; AVX512BW-NEXT: kandw %k2, %k0, %k0
12375 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
12376 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12377 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12378 ; AVX512BW-NEXT: kandw %k5, %k0, %k0
12379 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
12380 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12381 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12382 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12383 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
12384 ; AVX512BW-NEXT: kshiftrq $51, %k7, %k0
12385 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12386 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12387 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12388 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12389 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12390 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12391 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12392 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12393 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12394 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12395 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12396 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12397 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12398 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12399 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12400 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12401 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12402 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
12403 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12404 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12405 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12406 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
12407 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12408 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12409 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
12410 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12411 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12412 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12413 ; AVX512BW-NEXT: kshiftrq $52, %k7, %k6
12414 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
12415 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
12416 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12417 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12418 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12419 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
12420 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12421 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12422 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12423 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
12424 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12425 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
12426 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
12427 ; AVX512BW-NEXT: korw %k0, %k1, %k1
12428 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z}
12429 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12430 ; AVX512BW-NEXT: kandw %k3, %k6, %k1
12431 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
12432 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12433 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12434 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12435 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
12436 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12437 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
12438 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
12439 ; AVX512BW-NEXT: kshiftrq $53, %k7, %k0
12440 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12441 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
12442 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12443 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12444 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12445 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12446 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12447 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12448 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12449 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12450 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12451 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12452 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12453 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12454 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12455 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12456 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12457 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12458 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12459 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12460 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12461 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12462 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12463 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
12464 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12465 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12466 ; AVX512BW-NEXT: kandw %k1, %k0, %k6
12467 ; AVX512BW-NEXT: kshiftrq $54, %k7, %k0
12468 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
12469 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
12470 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12471 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12472 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
12473 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
12474 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12475 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12476 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
12477 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
12478 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12479 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12480 ; AVX512BW-NEXT: kandw %k4, %k6, %k6
12481 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
12482 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12483 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
12484 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
12485 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
12486 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12487 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
12488 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
12489 ; AVX512BW-NEXT: korw %k1, %k6, %k1
12490 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
12491 ; AVX512BW-NEXT: kandw %k3, %k0, %k1
12492 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
12493 ; AVX512BW-NEXT: kshiftrq $55, %k7, %k0
12494 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12495 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
12496 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12497 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12498 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12499 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
12500 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12501 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12502 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12503 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
12504 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12505 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12506 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12507 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12508 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12509 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12510 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12511 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12512 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12513 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12514 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12515 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12516 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12517 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
12518 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12519 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12520 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12521 ; AVX512BW-NEXT: kshiftrq $56, %k7, %k0
12522 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
12523 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
12524 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12525 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12526 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12527 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
12528 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12529 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12530 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12531 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
12532 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12533 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12534 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12535 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
12536 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12537 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12538 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12539 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
12540 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12541 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12542 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
12543 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12544 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12545 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12546 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
12547 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12548 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
12549 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
12550 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
12551 ; AVX512BW-NEXT: kshiftrq $57, %k4, %k1
12552 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
12553 ; AVX512BW-NEXT: korw %k6, %k0, %k7
12554 ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z}
12555 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
12556 ; AVX512BW-NEXT: kandw %k0, %k1, %k0
12557 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
12558 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12559 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12560 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
12561 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
12562 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12563 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12564 ; AVX512BW-NEXT: kandw %k1, %k0, %k0
12565 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
12566 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12567 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
12568 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
12569 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12570 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12571 ; AVX512BW-NEXT: kandw %k3, %k0, %k0
12572 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
12573 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12574 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12575 ; AVX512BW-NEXT: kandw %k2, %k0, %k1
12576 ; AVX512BW-NEXT: kmovq %k4, %k7
12577 ; AVX512BW-NEXT: kshiftrq $58, %k4, %k0
12578 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12579 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12580 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12581 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12582 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12583 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12584 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12585 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12586 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12587 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12588 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12589 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12590 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12591 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
12592 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12593 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12594 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12595 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
12596 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12597 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12598 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12599 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
12600 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12601 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12602 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12603 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
12604 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12605 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12606 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12607 ; AVX512BW-NEXT: kshiftrq $59, %k7, %k6
12608 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
12609 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
12610 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12611 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12612 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
12613 ; AVX512BW-NEXT: korw %k7, %k1, %k1
12614 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
12615 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
12616 ; AVX512BW-NEXT: korw %k0, %k1, %k1
12617 ; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
12618 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12619 ; AVX512BW-NEXT: kandw %k1, %k6, %k1
12620 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
12621 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12622 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12623 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12624 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
12625 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12626 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
12627 ; AVX512BW-NEXT: kandw %k5, %k1, %k1
12628 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
12629 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12630 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12631 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12632 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
12633 ; AVX512BW-NEXT: kshiftrq $60, %k5, %k0
12634 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12635 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12636 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12637 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12638 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12639 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12640 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12641 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12642 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12643 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12644 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12645 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12646 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12647 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12648 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12649 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
12650 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12651 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12652 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
12653 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12654 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
12655 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12656 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
12657 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12658 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
12659 ; AVX512BW-NEXT: kandw %k4, %k0, %k6
12660 ; AVX512BW-NEXT: kshiftrq $61, %k5, %k0
12661 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
12662 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
12663 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12664 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12665 ; AVX512BW-NEXT: kandw %k2, %k6, %k6
12666 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
12667 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12668 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
12669 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
12670 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
12671 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12672 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
12673 ; AVX512BW-NEXT: kandw %k7, %k6, %k6
12674 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
12675 ; AVX512BW-NEXT: korw %k7, %k6, %k6
12676 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
12677 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
12678 ; AVX512BW-NEXT: korw %k1, %k6, %k6
12679 ; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z}
12680 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12681 ; AVX512BW-NEXT: kandw %k6, %k0, %k0
12682 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
12683 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12684 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12685 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12686 ; AVX512BW-NEXT: kshiftrq $62, %k5, %k0
12687 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
12688 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
12689 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12690 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12691 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12692 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
12693 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12694 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12695 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12696 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
12697 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12698 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12699 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12700 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
12701 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12702 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12703 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12704 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
12705 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12706 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12707 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12708 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
12709 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12710 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
12711 ; AVX512BW-NEXT: kandw %k6, %k1, %k1
12712 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
12713 ; AVX512BW-NEXT: korw %k0, %k1, %k0
12714 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5
12715 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
12716 ; AVX512BW-NEXT: kandw %k1, %k0, %k1
12717 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k0
12718 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
12719 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12720 ; AVX512BW-NEXT: kandw %k3, %k1, %k1
12721 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
12722 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12723 ; AVX512BW-NEXT: kandw %k4, %k1, %k1
12724 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
12725 ; AVX512BW-NEXT: korw %k6, %k1, %k1
12726 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12727 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
12728 ; AVX512BW-NEXT: korw %k4, %k1, %k1
12729 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12730 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12731 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k3
12732 ; AVX512BW-NEXT: korw %k3, %k1, %k1
12733 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
12734 ; AVX512BW-NEXT: kandw %k2, %k1, %k1
12735 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k2
12736 ; AVX512BW-NEXT: korw %k2, %k1, %k1
12737 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
12738 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
12739 ; AVX512BW-NEXT: korw %k0, %k1, %k1
12740 ; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z}
12741 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
12742 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
12743 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
12744 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rdx)
12745 ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx)
12746 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx)
12747 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx)
12748 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx)
12749 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
12750 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx)
12751 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx)
12752 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
12753 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx)
12754 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx)
12755 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx)
12756 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx)
12757 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
12758 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx)
12759 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
12760 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx)
12761 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
12762 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
12763 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
12764 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
12765 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
12766 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
12767 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
12768 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
12769 ; AVX512BW-NEXT: vzeroupper
12770 ; AVX512BW-NEXT: retq
%src.mask = load <64 x i1>, ptr %in.maskvec, align 64
%tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
%data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison)
store <448 x i32> %data, ptr %out.vec, align 64
ret void
}

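; Replication factor 8, vf2: the <2 x i1> source mask is widened to <16 x i1> (each bit repeated 8 times) and drives a single masked zmm load/store.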
define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
%tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
store <16 x i32> %data, ptr %out.vec, align 64
ret void
}

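; Replication factor 8, vf4: the <4 x i1> source mask is widened to <32 x i1>, driving two masked zmm loads/stores.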
define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
store <32 x i32> %data, ptr %out.vec, align 64
ret void
}

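; Replication factor 8, vf8: the <8 x i1> source mask is widened to <64 x i1>, driving four masked zmm loads/stores.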
define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
12977 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
12978 %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
12979 store <64 x i32> %data, ptr %out.vec, align 64
12980 ret void
12981 }
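; Replication factor 8, 16-element mask: the low 16 mask bits are expanded to a <128 x i1> mask for a masked load of <128 x i32>.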
12983 define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
12984 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16:
12985 ; AVX512F-ONLY: # %bb.0:
12986 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
12987 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
12988 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
12989 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
12990 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
12991 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
12992 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
12993 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
12994 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
12995 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
12996 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
12997 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
12998 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
12999 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
13000 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
13001 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
13002 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
13003 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
13004 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13005 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
13006 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
13007 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
13008 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
13009 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7
13010 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
13011 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
13012 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
13013 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
13014 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
13015 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
13016 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
13017 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
13018 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
13019 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
13020 ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
13021 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
13022 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
13023 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
13024 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
13025 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
13026 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
13027 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
13028 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
13029 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
13030 ; AVX512F-ONLY-NEXT: vzeroupper
13031 ; AVX512F-ONLY-NEXT: retq
13033 ; AVX512DQ-LABEL: mask_replication_factor8_vf16:
13034 ; AVX512DQ: # %bb.0:
13035 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
13036 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
13037 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
13038 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
13039 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
13040 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
13041 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
13042 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
13043 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
13044 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
13045 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
13046 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
13047 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
13048 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
13049 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
13050 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
13051 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
13052 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
13053 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13054 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
13055 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
13056 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
13057 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
13058 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
13059 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
13060 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
13061 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
13062 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
13063 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
13064 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
13065 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
13066 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
13067 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
13068 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
13069 ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
13070 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
13071 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
13072 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
13073 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
13074 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
13075 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
13076 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
13077 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
13078 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
13079 ; AVX512DQ-NEXT: vzeroupper
13080 ; AVX512DQ-NEXT: retq
13082 ; AVX512BW-LABEL: mask_replication_factor8_vf16:
13083 ; AVX512BW: # %bb.0:
13084 ; AVX512BW-NEXT: kmovw (%rdi), %k0
13085 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
13086 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
13087 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
13088 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
13089 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
13090 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
13091 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
13092 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
13093 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
13094 ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
13095 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
13096 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
13097 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
13098 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13099 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
13100 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
13101 ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
13102 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13103 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
13104 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
13105 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
13106 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx)
13107 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
13108 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
13109 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
13110 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
13111 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
13112 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
13113 ; AVX512BW-NEXT: vzeroupper
13114 ; AVX512BW-NEXT: retq
13115 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
13116 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
13117 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
13118 %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
13119 store <128 x i32> %data, ptr %out.vec, align 64
13120 ret void
13121 }
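; Replication factor 8, 32-element mask: the low 32 mask bits are expanded to a <256 x i1> mask for a masked load of <256 x i32>.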
13123 define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
13124 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32:
13125 ; AVX512F-ONLY: # %bb.0:
13126 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
13127 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
13128 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
13129 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
13130 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0
13131 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
13132 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4
13133 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
13134 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6
13135 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
13136 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8
13137 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13138 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10
13139 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
13140 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12
13141 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
13142 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14
13143 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
13144 ; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1
13145 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm16 {%k1} {z} = -1
13146 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2
13147 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3
13148 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5
13149 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7
13150 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9
13151 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm11, %zmm11
13152 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm13, %zmm13
13153 ; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm15, %zmm15
13154 ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
13155 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z}
13156 ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
13157 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
13158 ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
13159 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
13160 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
13161 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
13162 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
13163 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
13164 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
13165 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
13166 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
13167 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
13168 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
13169 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
13170 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
13171 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
13172 ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
13173 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
13174 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
13175 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
13176 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
13177 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
13178 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
13179 ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
13180 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
13181 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
13182 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
13183 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
13184 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
13185 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
13186 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 960(%rdx)
13187 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx)
13188 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 832(%rdx)
13189 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx)
13190 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
13191 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 640(%rdx)
13192 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx)
13193 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx)
13194 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx)
13195 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx)
13196 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx)
13197 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx)
13198 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx)
13199 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx)
13200 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 64(%rdx)
13201 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx)
13202 ; AVX512F-ONLY-NEXT: vzeroupper
13203 ; AVX512F-ONLY-NEXT: retq
13205 ; AVX512DQ-LABEL: mask_replication_factor8_vf32:
13206 ; AVX512DQ: # %bb.0:
13207 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
13208 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
13209 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
13210 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
13211 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0
13212 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
13213 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4
13214 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
13215 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6
13216 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
13217 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8
13218 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13219 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10
13220 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
13221 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12
13222 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
13223 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14
13224 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
13225 ; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1
13226 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16
13227 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2
13228 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3
13229 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5
13230 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7
13231 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9
13232 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm11, %zmm11
13233 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm13, %zmm13
13234 ; AVX512DQ-NEXT: vpermd %zmm16, %zmm15, %zmm15
13235 ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
13236 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z}
13237 ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
13238 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
13239 ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
13240 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
13241 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
13242 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
13243 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
13244 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
13245 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
13246 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
13247 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
13248 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
13249 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
13250 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
13251 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
13252 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
13253 ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
13254 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
13255 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
13256 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
13257 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
13258 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
13259 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
13260 ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
13261 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
13262 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
13263 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
13264 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
13265 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
13266 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
13267 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 960(%rdx)
13268 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx)
13269 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 832(%rdx)
13270 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx)
13271 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx)
13272 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rdx)
13273 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx)
13274 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx)
13275 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx)
13276 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx)
13277 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx)
13278 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx)
13279 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx)
13280 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx)
13281 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx)
13282 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx)
13283 ; AVX512DQ-NEXT: vzeroupper
13284 ; AVX512DQ-NEXT: retq
13286 ; AVX512BW-LABEL: mask_replication_factor8_vf32:
13287 ; AVX512BW: # %bb.0:
13288 ; AVX512BW-NEXT: kmovd (%rdi), %k0
13289 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
13290 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
13291 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
13292 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3
13293 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
13294 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13295 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
13296 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
13297 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
13298 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
13299 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k3
13300 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
13301 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4
13302 ; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
13303 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
13304 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
13305 ; AVX512BW-NEXT: kshiftrq $32, %k4, %k4
13306 ; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
13307 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
13308 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
13309 ; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
13310 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
13311 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
13312 ; AVX512BW-NEXT: kshiftrq $32, %k3, %k3
13313 ; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
13314 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
13315 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
13316 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
13317 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
13318 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
13319 ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
13320 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
13321 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
13322 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
13323 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13324 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
13325 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
13326 ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
13327 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13328 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
13329 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
13330 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx)
13331 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx)
13332 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rdx)
13333 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 832(%rdx)
13334 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
13335 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%rdx)
13336 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx)
13337 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rdx)
13338 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
13339 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx)
13340 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
13341 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
13342 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
13343 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
13344 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
13345 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
13346 ; AVX512BW-NEXT: vzeroupper
13347 ; AVX512BW-NEXT: retq
13348 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
13349 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
13350 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
13351 %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison)
13352 store <256 x i32> %data, ptr %out.vec, align 64
13353 ret void
13354 }
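; Replication factor 8, 64-element mask: the full <64 x i1> mask is expanded to a <512 x i1> mask for a masked load of <512 x i32>.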
13356 define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
13357 ; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64:
13358 ; AVX512F-ONLY: # %bb.0:
13359 ; AVX512F-ONLY-NEXT: subq $136, %rsp
13360 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
13361 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm6 {%k1} {z} = -1
13362 ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
13363 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1
13364 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
13365 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm10 {%k1} {z} = -1
13366 ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
13367 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
13368 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0
13369 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13370 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
13371 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0
13372 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
13373 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
13374 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0
13375 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13376 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
13377 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0
13378 ; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13379 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13380 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4
13381 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
13382 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5
13383 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
13384 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7
13385 ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
13386 ; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9
13387 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11
13388 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13
13389 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm15
13390 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm17
13391 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm19
13392 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm21
13393 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm23
13394 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm25
13395 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm12, %zmm27
13396 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm14, %zmm28
13397 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm16, %zmm29
13398 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm18, %zmm30
13399 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm20, %zmm31
13400 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm22, %zmm3
13401 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm24, %zmm6
13402 ; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm26, %zmm2
13403 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1
13404 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm1
13405 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm0
13406 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm16
13407 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm14
13408 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm12
13409 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm10
13410 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm18
13411 ; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm8
13412 ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
13413 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z}
13414 ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1
13415 ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z}
13416 ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
13417 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z}
13418 ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
13419 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
13420 ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
13421 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z}
13422 ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1
13423 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z}
13424 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
13425 ; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z}
13426 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
13427 ; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z}
13428 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
13429 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z}
13430 ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
13431 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z}
13432 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
13433 ; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z}
13434 ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1
13435 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
13436 ; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1
13437 ; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z}
13438 ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1
13439 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
13440 ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1
13441 ; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
13442 ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1
13443 ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z}
13444 ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1
13445 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z}
13446 ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1
13447 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z}
13448 ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1
13449 ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z}
13450 ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1
13451 ; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
13452 ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1
13453 ; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
13454 ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1
13455 ; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z}
13456 ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1
13457 ; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z}
13458 ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
13459 ; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z}
13460 ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
13461 ; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z}
13462 ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1
13463 ; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z}
13464 ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
13465 ; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z}
13466 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
13467 ; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z}
13468 ; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
13469 ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1
13470 ; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z}
13471 ; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
13472 ; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1
13473 ; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z}
13474 ; AVX512F-ONLY-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
13475 ; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1
13476 ; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
13477 ; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
13478 ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1
13479 ; AVX512F-ONLY-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
13480 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
13481 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
13482 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 1856(%rdx)
13483 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 1792(%rdx)
13484 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1728(%rdx)
13485 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1664(%rdx)
13486 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1600(%rdx)
13487 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1536(%rdx)
13488 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 1472(%rdx)
13489 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 1408(%rdx)
13490 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1344(%rdx)
13491 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx)
13492 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
13493 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1152(%rdx)
13494 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1088(%rdx)
13495 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 1024(%rdx)
13496 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 960(%rdx)
13497 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx)
13498 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx)
13499 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 768(%rdx)
13500 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 704(%rdx)
13501 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 640(%rdx)
13502 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 576(%rdx)
13503 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 512(%rdx)
13504 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 448(%rdx)
13505 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 384(%rdx)
13506 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 320(%rdx)
13507 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 256(%rdx)
13508 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx)
13509 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 128(%rdx)
13510 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 64(%rdx)
13511 ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, (%rdx)
13512 ; AVX512F-ONLY-NEXT: addq $136, %rsp
13513 ; AVX512F-ONLY-NEXT: vzeroupper
13514 ; AVX512F-ONLY-NEXT: retq
13516 ; AVX512DQ-LABEL: mask_replication_factor8_vf64:
13517 ; AVX512DQ: # %bb.0:
13518 ; AVX512DQ-NEXT: subq $136, %rsp
13519 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
13520 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6
13521 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
13522 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
13523 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
13524 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10
13525 ; AVX512DQ-NEXT: kmovw (%rdi), %k0
13526 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
13527 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0
13528 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13529 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
13530 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0
13531 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
13532 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
13533 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0
13534 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13535 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
13536 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0
13537 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13538 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13539 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4
13540 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
13541 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5
13542 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
13543 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7
13544 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
13545 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9
13546 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11
13547 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13
13548 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm15
13549 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm17
13550 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm19
13551 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm21
13552 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm23
13553 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm25
13554 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm12, %zmm27
13555 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm14, %zmm28
13556 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm16, %zmm29
13557 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm18, %zmm30
13558 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm20, %zmm31
13559 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm22, %zmm3
13560 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm24, %zmm6
13561 ; AVX512DQ-NEXT: vpermd %zmm10, %zmm26, %zmm2
13562 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8
13563 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm1
13564 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm0
13565 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm16
13566 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm14
13567 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm12
13568 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm10
13569 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm18
13570 ; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm8
13571 ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
13572 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z}
13573 ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1
13574 ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z}
13575 ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
13576 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z}
13577 ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
13578 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z}
13579 ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
13580 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z}
13581 ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1
13582 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z}
13583 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
13584 ; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z}
13585 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
13586 ; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z}
13587 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
13588 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z}
13589 ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
13590 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z}
13591 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
13592 ; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z}
13593 ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1
13594 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
13595 ; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1
13596 ; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z}
13597 ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1
13598 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z}
13599 ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1
13600 ; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z}
13601 ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1
13602 ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z}
13603 ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1
13604 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z}
13605 ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1
13606 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z}
13607 ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1
13608 ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z}
13609 ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1
13610 ; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
13611 ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1
13612 ; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z}
13613 ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
13614 ; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z}
13615 ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
13616 ; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z}
13617 ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
13618 ; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z}
13619 ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
13620 ; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z}
13621 ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
13622 ; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z}
13623 ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
13624 ; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z}
13625 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
13626 ; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z}
13627 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
13628 ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1
13629 ; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z}
13630 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
13631 ; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1
13632 ; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z}
13633 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload
13634 ; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1
13635 ; AVX512DQ-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
13636 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
13637 ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1
13638 ; AVX512DQ-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
13639 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
13640 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
13641 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1856(%rdx)
13642 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1792(%rdx)
13643 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rdx)
13644 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rdx)
13645 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1600(%rdx)
13646 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1536(%rdx)
13647 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1472(%rdx)
13648 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1408(%rdx)
13649 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1344(%rdx)
13650 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx)
13651 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
13652 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1152(%rdx)
13653 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1088(%rdx)
13654 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1024(%rdx)
13655 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 960(%rdx)
13656 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx)
13657 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx)
13658 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rdx)
13659 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 704(%rdx)
13660 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rdx)
13661 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 576(%rdx)
13662 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rdx)
13663 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%rdx)
13664 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rdx)
13665 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rdx)
13666 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rdx)
13667 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx)
13668 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx)
13669 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rdx)
13670 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rdx)
13671 ; AVX512DQ-NEXT: addq $136, %rsp
13672 ; AVX512DQ-NEXT: vzeroupper
13673 ; AVX512DQ-NEXT: retq
13675 ; AVX512BW-LABEL: mask_replication_factor8_vf64:
13676 ; AVX512BW: # %bb.0:
13677 ; AVX512BW-NEXT: kmovq (%rdi), %k0
13678 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
13679 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
13680 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
13681 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7
13682 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
13683 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12
13684 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
13685 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16
13686 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15
13687 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
13688 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10
13689 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5
13690 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
13691 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
13692 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
13693 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
13694 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
13695 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
13696 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
13697 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
13698 ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
13699 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
13700 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
13701 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z}
13702 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13703 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
13704 ; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
13705 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
13706 ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
13707 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
13708 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
13709 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z}
13710 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
13711 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
13712 ; AVX512BW-NEXT: vpmovb2m %zmm10, %k1
13713 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
13714 ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
13715 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
13716 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
13717 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
13718 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13719 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z}
13720 ; AVX512BW-NEXT: vpmovb2m %zmm15, %k2
13721 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
13722 ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
13723 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z}
13724 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
13725 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z}
13726 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
13727 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z}
13728 ; AVX512BW-NEXT: vpmovb2m %zmm16, %k1
13729 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z}
13730 ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
13731 ; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z}
13732 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
13733 ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z}
13734 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13735 ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
13736 ; AVX512BW-NEXT: vpmovb2m %zmm12, %k2
13737 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
13738 ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
13739 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z}
13740 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
13741 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
13742 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
13743 ; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
13744 ; AVX512BW-NEXT: vpmovb2m %zmm7, %k1
13745 ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z}
13746 ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
13747 ; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z}
13748 ; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
13749 ; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z}
13750 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
13751 ; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z}
13752 ; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z}
13753 ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
13754 ; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
13755 ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
13756 ; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
13757 ; AVX512BW-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
13758 ; AVX512BW-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
13759 ; AVX512BW-NEXT: vmovdqa64 %zmm28, 1856(%rdx)
13760 ; AVX512BW-NEXT: vmovdqa64 %zmm29, 1792(%rdx)
13761 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
13762 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
13763 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
13764 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rdx)
13765 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx)
13766 ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx)
13767 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx)
13768 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx)
13769 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx)
13770 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx)
13771 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx)
13772 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
13773 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx)
13774 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx)
13775 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx)
13776 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx)
13777 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx)
13778 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
13779 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
13780 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx)
13781 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx)
13782 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
13783 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
13784 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
13785 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
13786 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
13787 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
13788 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
13789 ; AVX512BW-NEXT: vzeroupper
13790 ; AVX512BW-NEXT: retq
13791 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
13792 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
13793 %data = call <512 x i32> @llvm.masked.load.v512i32.p0(ptr %in.vec, i32 64, <512 x i1> %tgt.mask, <512 x i32> poison)
13794 store <512 x i32> %data, ptr %out.vec, align 64
13795 ret void
13796 }
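; Masked-load intrinsic declarations referenced by the tests above.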
13798 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
13799 declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
13800 declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
13801 declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
13802 declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
13803 declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
13804 declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
13805 declare <20 x i32> @llvm.masked.load.v20i32.p0(ptr, i32, <20 x i1>, <20 x i32>)
13806 declare <24 x i32> @llvm.masked.load.v24i32.p0(ptr, i32, <24 x i1>, <24 x i32>)
13807 declare <28 x i32> @llvm.masked.load.v28i32.p0(ptr, i32, <28 x i1>, <28 x i32>)
13808 declare <32 x i32> @llvm.masked.load.v32i32.p0(ptr, i32, <32 x i1>, <32 x i32>)
13809 declare <40 x i32> @llvm.masked.load.v40i32.p0(ptr, i32, <40 x i1>, <40 x i32>)
13810 declare <48 x i32> @llvm.masked.load.v48i32.p0(ptr, i32, <48 x i1>, <48 x i32>)
13811 declare <56 x i32> @llvm.masked.load.v56i32.p0(ptr, i32, <56 x i1>, <56 x i32>)
13812 declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr, i32, <64 x i1>, <64 x i32>)
13813 declare <80 x i32> @llvm.masked.load.v80i32.p0(ptr, i32, <80 x i1>, <80 x i32>)
13814 declare <96 x i32> @llvm.masked.load.v96i32.p0(ptr, i32, <96 x i1>, <96 x i32>)
13815 declare <112 x i32> @llvm.masked.load.v112i32.p0(ptr, i32, <112 x i1>, <112 x i32>)
13816 declare <128 x i32> @llvm.masked.load.v128i32.p0(ptr, i32, <128 x i1>, <128 x i32>)
13817 declare <160 x i32> @llvm.masked.load.v160i32.p0(ptr, i32, <160 x i1>, <160 x i32>)
13818 declare <192 x i32> @llvm.masked.load.v192i32.p0(ptr, i32, <192 x i1>, <192 x i32>)
13819 declare <224 x i32> @llvm.masked.load.v224i32.p0(ptr, i32, <224 x i1>, <224 x i32>)
13820 declare <256 x i32> @llvm.masked.load.v256i32.p0(ptr, i32, <256 x i1>, <256 x i32>)
13821 declare <320 x i32> @llvm.masked.load.v320i32.p0(ptr, i32, <320 x i1>, <320 x i32>)
13822 declare <384 x i32> @llvm.masked.load.v384i32.p0(ptr, i32, <384 x i1>, <384 x i32>)
13823 declare <448 x i32> @llvm.masked.load.v448i32.p0(ptr, i32, <448 x i1>, <448 x i32>)
13824 declare <512 x i32> @llvm.masked.load.v512i32.p0(ptr, i32, <512 x i1>, <512 x i32>)
13825 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
13827 ; FALLBACK0: {{.*}}
13828 ; FALLBACK1: {{.*}}
13829 ; FALLBACK2: {{.*}}
13830 ; FALLBACK3: {{.*}}
13831 ; FALLBACK4: {{.*}}
13832 ; FALLBACK5: {{.*}}
13833 ; FALLBACK6: {{.*}}
13834 ; FALLBACK7: {{.*}}