1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512,AVX512-FAST-PERLANE
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-perlane-shuffle -O2 | FileCheck %s --check-prefixes=AVX512NOTDQ,AVX512NOTDQ-FAST-PERLANE
; Splat element 4 of the <8 x i1> mask loaded from %a0 into a <2 x i1>
; condition, select per lane between %a1 and %a2, and store the result to %a3.
; The check lines are autogenerated (see the note at the top of the file).
7 define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
8 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
10 ; AVX512-NEXT: kmovb (%rdi), %k0
11 ; AVX512-NEXT: kshiftrb $4, %k0, %k0
12 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
13 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
14 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
15 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
16 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
19 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
20 ; AVX512NOTDQ: # %bb.0:
21 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
22 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
23 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
24 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
25 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
26 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
27 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
28 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
29 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
30 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 4 to both result lanes, select, store.
31 %d0 = load <8 x i1>, <8 x i1>* %a0
32 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
33 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
34 store <2 x double> %d2, <2 x double>* %a3
; Splat element 7 (the last lane) of the <8 x i1> mask loaded from %a0 into a
; <2 x i1> condition, select between %a1 and %a2, and store the result to %a3.
; The expected code shifts the mask right by 6 and then splats the upper
; 64-bit lane, which holds original mask bit 7.
37 define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
38 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
40 ; AVX512-NEXT: kmovb (%rdi), %k0
41 ; AVX512-NEXT: kshiftrb $6, %k0, %k0
42 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
43 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
44 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
45 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
46 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
49 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
50 ; AVX512NOTDQ: # %bb.0:
51 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
52 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
53 ; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
54 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
55 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
56 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
57 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
58 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
59 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
60 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 7 to both result lanes, select, store.
61 %d0 = load <8 x i1>, <8 x i1>* %a0
62 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
63 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
64 store <2 x double> %d2, <2 x double>* %a3
; Splat element 8 of the <16 x i1> mask loaded from %a0 into a <2 x i1>
; condition, select between %a1 and %a2, and store the result to %a3.
67 define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
68 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
70 ; AVX512-NEXT: kmovw (%rdi), %k0
71 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
72 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
73 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
74 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
75 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
76 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
79 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
80 ; AVX512NOTDQ: # %bb.0:
81 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
82 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
83 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
84 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
85 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
86 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
87 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
88 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
89 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 8 to both result lanes, select, store.
90 %d0 = load <16 x i1>, <16 x i1>* %a0
91 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
92 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
93 store <2 x double> %d2, <2 x double>* %a3
; Splat element 8 of the <16 x i1> mask loaded from %a0 into a <4 x i1>
; condition, select between the <4 x float> values %a1 and %a2, and store the
; result to %a3.
96 define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
97 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
99 ; AVX512-NEXT: kmovw (%rdi), %k0
100 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
101 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
102 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
103 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
104 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
105 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
108 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
109 ; AVX512NOTDQ: # %bb.0:
110 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
111 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
112 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
113 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
114 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
115 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
116 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
117 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
118 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 8 to all four result lanes, select, store.
119 %d0 = load <16 x i1>, <16 x i1>* %a0
120 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
121 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
122 store <4 x float> %d2, <4 x float>* %a3
; Splat element 15 (the last lane) of the <16 x i1> mask loaded from %a0 into
; a <2 x i1> condition, select between %a1 and %a2, and store the result to %a3.
; The expected code shifts the mask right by 14 and splats the upper 64-bit
; lane, which holds original mask bit 15.
125 define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
126 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
128 ; AVX512-NEXT: kmovw (%rdi), %k0
129 ; AVX512-NEXT: kshiftrw $14, %k0, %k0
130 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
131 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
132 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
133 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
134 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
137 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
138 ; AVX512NOTDQ: # %bb.0:
139 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
140 ; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
141 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
142 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
143 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
144 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
145 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
146 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
147 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 15 to both result lanes, select, store.
148 %d0 = load <16 x i1>, <16 x i1>* %a0
149 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
150 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
151 store <2 x double> %d2, <2 x double>* %a3
; Splat element 15 (the last lane) of the <16 x i1> mask loaded from %a0 into
; a <4 x i1> condition, select between the <4 x float> values %a1 and %a2, and
; store the result to %a3.
; The expected code shifts the mask right by 12 and splats 32-bit lane 3,
; which holds original mask bit 15.
154 define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
155 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
157 ; AVX512-NEXT: kmovw (%rdi), %k0
158 ; AVX512-NEXT: kshiftrw $12, %k0, %k0
159 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
160 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
161 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
162 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
163 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
166 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
167 ; AVX512NOTDQ: # %bb.0:
168 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
169 ; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
170 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
171 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
172 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
173 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
174 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
175 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
176 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 15 to all four result lanes, select, store.
177 %d0 = load <16 x i1>, <16 x i1>* %a0
178 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
179 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
180 store <4 x float> %d2, <4 x float>* %a3
; Splat element 16 of the <32 x i1> mask loaded from %a0 into a <2 x i1>
; condition, select between %a1 and %a2, and store the result to %a3.
183 define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
184 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
186 ; AVX512-NEXT: kmovd (%rdi), %k0
187 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
188 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
189 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
190 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
191 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
192 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
195 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
196 ; AVX512NOTDQ: # %bb.0:
197 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
198 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
199 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
200 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
201 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
202 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
203 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
204 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
205 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 16 to both result lanes, select, store.
206 %d0 = load <32 x i1>, <32 x i1>* %a0
207 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
208 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
209 store <2 x double> %d2, <2 x double>* %a3
; Splat element 16 of the <32 x i1> mask loaded from %a0 into a <4 x i1>
; condition, select between the <4 x float> values %a1 and %a2, and store the
; result to %a3.
212 define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
213 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
215 ; AVX512-NEXT: kmovd (%rdi), %k0
216 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
217 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
218 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
219 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
220 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
221 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
224 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
225 ; AVX512NOTDQ: # %bb.0:
226 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
227 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
228 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
229 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
230 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
231 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
232 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
233 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
234 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 16 to all four result lanes, select, store.
235 %d0 = load <32 x i1>, <32 x i1>* %a0
236 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
237 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
238 store <4 x float> %d2, <4 x float>* %a3
; Splat element 16 of the <32 x i1> mask loaded from %a0 into an <8 x i1>
; condition, select between the <8 x float> values %a1 and %a2, and store the
; result to %a3.
; The expected code has no mask shift: it loads the mask starting at byte
; offset 2 (bit 16) and broadcasts lane 0 of the expanded vector.
241 define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
242 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
244 ; AVX512-NEXT: kmovb 2(%rdi), %k0
245 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
246 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
247 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
248 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
249 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
250 ; AVX512-NEXT: vzeroupper
253 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
254 ; AVX512NOTDQ: # %bb.0:
255 ; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1
256 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
257 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
258 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
259 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
260 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
261 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
262 ; AVX512NOTDQ-NEXT: vzeroupper
263 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 16 to all eight result lanes, select, store.
264 %d0 = load <32 x i1>, <32 x i1>* %a0
265 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
266 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
267 store <8 x float> %d2, <8 x float>* %a3
; Splat element 31 (the last lane) of the <32 x i1> mask loaded from %a0 into
; a <2 x i1> condition, select between %a1 and %a2, and store the result to %a3.
; The expected code shifts the mask right by 30 and splats the upper 64-bit
; lane, which holds original mask bit 31.
270 define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
271 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
273 ; AVX512-NEXT: kmovd (%rdi), %k0
274 ; AVX512-NEXT: kshiftrd $30, %k0, %k0
275 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
276 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
277 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
278 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
279 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
282 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
283 ; AVX512NOTDQ: # %bb.0:
284 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
285 ; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
286 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
287 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
288 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
289 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
290 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
291 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
292 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 31 to both result lanes, select, store.
293 %d0 = load <32 x i1>, <32 x i1>* %a0
294 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
295 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
296 store <2 x double> %d2, <2 x double>* %a3
; Splat element 31 (the last lane) of the <32 x i1> mask loaded from %a0 into
; a <4 x i1> condition, select between the <4 x float> values %a1 and %a2, and
; store the result to %a3.
; The expected code shifts the mask right by 28 and splats 32-bit lane 3,
; which holds original mask bit 31.
299 define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
300 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
302 ; AVX512-NEXT: kmovd (%rdi), %k0
303 ; AVX512-NEXT: kshiftrd $28, %k0, %k0
304 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
305 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
306 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
307 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
308 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
311 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
312 ; AVX512NOTDQ: # %bb.0:
313 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
314 ; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
315 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
316 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
317 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
318 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
319 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
320 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
321 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 31 to all four result lanes, select, store.
322 %d0 = load <32 x i1>, <32 x i1>* %a0
323 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
324 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
325 store <4 x float> %d2, <4 x float>* %a3
; Splat element 31 (the last lane) of the <32 x i1> mask loaded from %a0 into
; an <8 x i1> condition, select between the <8 x float> values %a1 and %a2,
; and store the result to %a3.
; The expected output differs per RUN configuration: runs with
; +fast-variable-crosslane-shuffle expect a single vpermd with a splatted
; index of 7, while runs with only +fast-variable-perlane-shuffle expect a
; vpshufd followed by vpermq.
328 define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
329 ; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
330 ; AVX512-FAST: # %bb.0:
331 ; AVX512-FAST-NEXT: kmovb 3(%rdi), %k0
332 ; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm2
333 ; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
334 ; AVX512-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
335 ; AVX512-FAST-NEXT: vpmovd2m %ymm2, %k1
336 ; AVX512-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
337 ; AVX512-FAST-NEXT: vmovaps %ymm1, (%rsi)
338 ; AVX512-FAST-NEXT: vzeroupper
339 ; AVX512-FAST-NEXT: retq
341 ; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
342 ; AVX512-FAST-PERLANE: # %bb.0:
343 ; AVX512-FAST-PERLANE-NEXT: kmovb 3(%rdi), %k0
344 ; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm2
345 ; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
346 ; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
347 ; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm2, %k1
348 ; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
349 ; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
350 ; AVX512-FAST-PERLANE-NEXT: vzeroupper
351 ; AVX512-FAST-PERLANE-NEXT: retq
353 ; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1:
354 ; AVX512NOTDQ-FAST: # %bb.0:
355 ; AVX512NOTDQ-FAST-NEXT: movzbl 3(%rdi), %eax
356 ; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1
357 ; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
358 ; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
359 ; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
360 ; AVX512NOTDQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
361 ; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm2, %ymm2, %k1
362 ; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
363 ; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm1, (%rsi)
364 ; AVX512NOTDQ-FAST-NEXT: vzeroupper
365 ; AVX512NOTDQ-FAST-NEXT: retq
367 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1:
368 ; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
369 ; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 3(%rdi), %eax
370 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1
371 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
372 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
373 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
374 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
375 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm2, %ymm2, %k1
376 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
377 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
378 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
379 ; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
; IR under test: load mask, broadcast lane 31 to all eight result lanes, select, store.
380 %d0 = load <32 x i1>, <32 x i1>* %a0
381 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
382 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
383 store <8 x float> %d2, <8 x float>* %a3
; Splat element 32 of the <64 x i1> mask loaded from %a0 into a <2 x i1>
; condition, select between %a1 and %a2, and store the result to %a3.
386 define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
387 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
389 ; AVX512-NEXT: kmovq (%rdi), %k0
390 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
391 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
392 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
393 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
394 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
395 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
398 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
399 ; AVX512NOTDQ: # %bb.0:
400 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
401 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
402 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
403 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
404 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
405 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
406 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
407 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
408 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 32 to both result lanes, select, store.
409 %d0 = load <64 x i1>, <64 x i1>* %a0
410 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
411 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
412 store <2 x double> %d2, <2 x double>* %a3
; Splat element 32 of the <64 x i1> mask loaded from %a0 into a <4 x i1>
; condition, select between the <4 x float> values %a1 and %a2, and store the
; result to %a3.
415 define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
416 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
418 ; AVX512-NEXT: kmovq (%rdi), %k0
419 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
420 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
421 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
422 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
423 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
424 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
427 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
428 ; AVX512NOTDQ: # %bb.0:
429 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
430 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
431 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
432 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
433 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
434 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
435 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
436 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
437 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 32 to all four result lanes, select, store.
438 %d0 = load <64 x i1>, <64 x i1>* %a0
439 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
440 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
441 store <4 x float> %d2, <4 x float>* %a3
; Splat element 32 of the <64 x i1> mask loaded from %a0 into an <8 x i1>
; condition, select between the <8 x float> values %a1 and %a2, and store the
; result to %a3.
; The expected code has no mask shift: it loads the mask starting at byte
; offset 4 (bit 32) and broadcasts lane 0 of the expanded vector.
444 define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
445 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
447 ; AVX512-NEXT: kmovb 4(%rdi), %k0
448 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
449 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
450 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
451 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
452 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
453 ; AVX512-NEXT: vzeroupper
456 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
457 ; AVX512NOTDQ: # %bb.0:
458 ; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
459 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
460 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
461 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
462 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
463 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
464 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
465 ; AVX512NOTDQ-NEXT: vzeroupper
466 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 32 to all eight result lanes, select, store.
467 %d0 = load <64 x i1>, <64 x i1>* %a0
468 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
469 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
470 store <8 x float> %d2, <8 x float>* %a3
; Splat element 32 of the <64 x i1> mask loaded from %a0 into a <16 x i1>
; condition, select between the <16 x float> values %a1 and %a2, and store
; the result to %a3.
; The expected code has no mask shift: it loads 16 mask bits starting at byte
; offset 4 (bit 32) and broadcasts lane 0 of the expanded vector.
473 define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
474 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
476 ; AVX512-NEXT: kmovw 4(%rdi), %k0
477 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
478 ; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
479 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
480 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
481 ; AVX512-NEXT: vmovaps %zmm1, (%rsi)
482 ; AVX512-NEXT: vzeroupper
485 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
486 ; AVX512NOTDQ: # %bb.0:
487 ; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
488 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
489 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
490 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
491 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
492 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
493 ; AVX512NOTDQ-NEXT: vzeroupper
494 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 32 to all sixteen result lanes, select, store.
495 %d0 = load <64 x i1>, <64 x i1>* %a0
496 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
497 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
498 store <16 x float> %d2, <16 x float>* %a3
; Splat element 63 (the last lane) of the <64 x i1> mask loaded from %a0 into
; a <2 x i1> condition, select between %a1 and %a2, and store the result to %a3.
; The expected code shifts the mask right by 62 and splats the upper 64-bit
; lane, which holds original mask bit 63.
501 define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
502 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
504 ; AVX512-NEXT: kmovq (%rdi), %k0
505 ; AVX512-NEXT: kshiftrq $62, %k0, %k0
506 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
507 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
508 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
509 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
510 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
513 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
514 ; AVX512NOTDQ: # %bb.0:
515 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
516 ; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
517 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
518 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
519 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
520 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
521 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
522 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
523 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 63 to both result lanes, select, store.
524 %d0 = load <64 x i1>, <64 x i1>* %a0
525 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
526 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
527 store <2 x double> %d2, <2 x double>* %a3
; Splat element 63 (the last lane) of the <64 x i1> mask loaded from %a0 into
; a <4 x i1> condition, select between the <4 x float> values %a1 and %a2, and
; store the result to %a3.
; The expected code shifts the mask right by 60 and splats 32-bit lane 3,
; which holds original mask bit 63.
530 define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
531 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
533 ; AVX512-NEXT: kmovq (%rdi), %k0
534 ; AVX512-NEXT: kshiftrq $60, %k0, %k0
535 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
536 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
537 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
538 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
539 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
542 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
543 ; AVX512NOTDQ: # %bb.0:
544 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
545 ; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
546 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
547 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
548 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
549 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
550 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
551 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
552 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, broadcast lane 63 to all four result lanes, select, store.
553 %d0 = load <64 x i1>, <64 x i1>* %a0
554 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
555 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
556 store <4 x float> %d2, <4 x float>* %a3
; Splat element 63 (the last lane) of the <64 x i1> mask loaded from %a0 into
; an <8 x i1> condition, select between the <8 x float> values %a1 and %a2,
; and store the result to %a3.
; The expected output differs per RUN configuration: runs with
; +fast-variable-crosslane-shuffle expect a single vpermd with a splatted
; index of 7, while runs with only +fast-variable-perlane-shuffle expect a
; vpshufd followed by vpermq. All variants read the mask byte at offset 7.
559 define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
560 ; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
561 ; AVX512-FAST: # %bb.0:
562 ; AVX512-FAST-NEXT: kmovb 7(%rdi), %k0
563 ; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm2
564 ; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
565 ; AVX512-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
566 ; AVX512-FAST-NEXT: vpmovd2m %ymm2, %k1
567 ; AVX512-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
568 ; AVX512-FAST-NEXT: vmovaps %ymm1, (%rsi)
569 ; AVX512-FAST-NEXT: vzeroupper
570 ; AVX512-FAST-NEXT: retq
572 ; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
573 ; AVX512-FAST-PERLANE: # %bb.0:
574 ; AVX512-FAST-PERLANE-NEXT: kmovb 7(%rdi), %k0
575 ; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm2
576 ; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
577 ; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
578 ; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm2, %k1
579 ; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
580 ; AVX512-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
581 ; AVX512-FAST-PERLANE-NEXT: vzeroupper
582 ; AVX512-FAST-PERLANE-NEXT: retq
584 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1:
585 ; AVX512NOTDQ-FAST: # %bb.0:
586 ; AVX512NOTDQ-FAST-NEXT: movzbl 7(%rdi), %eax
587 ; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1
588 ; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
589 ; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
590 ; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
591 ; AVX512NOTDQ-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
592 ; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm2, %ymm2, %k1
593 ; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm0, %ymm1 {%k1}
594 ; AVX512NOTDQ-FAST-NEXT: vmovaps %ymm1, (%rsi)
595 ; AVX512NOTDQ-FAST-NEXT: vzeroupper
596 ; AVX512NOTDQ-FAST-NEXT: retq
598 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1:
599 ; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
600 ; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 7(%rdi), %eax
601 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1
602 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
603 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
604 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
605 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
606 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm2, %ymm2, %k1
607 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 {%k1}
608 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi)
609 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
610 ; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
; IR under test: load mask, broadcast lane 63 to all eight result lanes, select, store.
611 %d0 = load <64 x i1>, <64 x i1>* %a0
612 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
613 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
614 store <8 x float> %d2, <8 x float>* %a3
; Splat element 63 (the last lane) of the <64 x i1> mask loaded from %a0 into
; a <16 x i1> condition, select between the <16 x float> values %a1 and %a2,
; and store the result to %a3.
; The expected output differs per RUN configuration: runs with
; +fast-variable-crosslane-shuffle expect a single vpermd with a splatted
; index of 15, while runs with only +fast-variable-perlane-shuffle expect a
; vpshufd followed by vshufi64x2. All variants read 16 mask bits at byte
; offset 6.
617 define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
618 ; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
619 ; AVX512-FAST: # %bb.0:
620 ; AVX512-FAST-NEXT: kmovw 6(%rdi), %k0
621 ; AVX512-FAST-NEXT: vpmovm2d %k0, %zmm2
622 ; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
623 ; AVX512-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm2
624 ; AVX512-FAST-NEXT: vpmovd2m %zmm2, %k1
625 ; AVX512-FAST-NEXT: vmovaps %zmm0, %zmm1 {%k1}
626 ; AVX512-FAST-NEXT: vmovaps %zmm1, (%rsi)
627 ; AVX512-FAST-NEXT: vzeroupper
628 ; AVX512-FAST-NEXT: retq
630 ; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
631 ; AVX512-FAST-PERLANE: # %bb.0:
632 ; AVX512-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k0
633 ; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %zmm2
634 ; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
635 ; AVX512-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
636 ; AVX512-FAST-PERLANE-NEXT: vpmovd2m %zmm2, %k1
637 ; AVX512-FAST-PERLANE-NEXT: vmovaps %zmm0, %zmm1 {%k1}
638 ; AVX512-FAST-PERLANE-NEXT: vmovaps %zmm1, (%rsi)
639 ; AVX512-FAST-PERLANE-NEXT: vzeroupper
640 ; AVX512-FAST-PERLANE-NEXT: retq
642 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1:
643 ; AVX512NOTDQ-FAST: # %bb.0:
644 ; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1
645 ; AVX512NOTDQ-FAST-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
646 ; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
647 ; AVX512NOTDQ-FAST-NEXT: vpermd %zmm2, %zmm3, %zmm2
648 ; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm2, %zmm2, %k1
649 ; AVX512NOTDQ-FAST-NEXT: vmovaps %zmm0, %zmm1 {%k1}
650 ; AVX512NOTDQ-FAST-NEXT: vmovaps %zmm1, (%rsi)
651 ; AVX512NOTDQ-FAST-NEXT: vzeroupper
652 ; AVX512NOTDQ-FAST-NEXT: retq
654 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1:
655 ; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
656 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k1
657 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
658 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
659 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[6,7,6,7,6,7,6,7]
660 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %zmm2, %zmm2, %k1
661 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %zmm0, %zmm1 {%k1}
662 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovaps %zmm1, (%rsi)
663 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
664 ; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
; IR under test: load mask, broadcast lane 63 to all sixteen result lanes, select, store.
665 %d0 = load <64 x i1>, <64 x i1>* %a0
666 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
667 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
668 store <16 x float> %d2, <16 x float>* %a3
; Extract element 1 of the <2 x i1> mask loaded from %a0 as a <1 x i1> vector
; and store it to %a1 (no select in this variant; the mask itself is stored).
; The expected code shifts the wanted bit down to position 0, then uses a
; left/right shift pair before storing the low byte.
671 define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
672 ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
674 ; AVX512-NEXT: kmovb (%rdi), %k0
675 ; AVX512-NEXT: kshiftrb $1, %k0, %k0
676 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
677 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
678 ; AVX512-NEXT: kmovb %k0, (%rsi)
681 ; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
682 ; AVX512NOTDQ: # %bb.0:
683 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
684 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
685 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
686 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
687 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
688 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
689 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
690 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, pick lane 1 into a one-element vector, store it.
691 %d0 = load <2 x i1>, <2 x i1>* %a0
692 %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
693 store <1 x i1> %d1, <1 x i1>* %a1
; Extract element 1 of the <3 x i1> mask loaded from %a0 as a <1 x i1> vector
; and store it to %a1.
; Unlike the power-of-two mask widths in this file, the expected code for the
; 3-element source first tests the bit with scalar instructions (shrb/testb)
; and materializes 0 or 255 via cmov before moving it into a mask register.
696 define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
697 ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
699 ; AVX512-NEXT: movb (%rdi), %al
700 ; AVX512-NEXT: shrb %al
701 ; AVX512-NEXT: xorl %ecx, %ecx
702 ; AVX512-NEXT: testb $1, %al
703 ; AVX512-NEXT: movl $255, %eax
704 ; AVX512-NEXT: cmovel %ecx, %eax
705 ; AVX512-NEXT: kmovd %eax, %k0
706 ; AVX512-NEXT: kshiftrb $1, %k0, %k0
707 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
708 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
709 ; AVX512-NEXT: kmovb %k0, (%rsi)
712 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
713 ; AVX512NOTDQ: # %bb.0:
714 ; AVX512NOTDQ-NEXT: movb (%rdi), %al
715 ; AVX512NOTDQ-NEXT: shrb %al
716 ; AVX512NOTDQ-NEXT: xorl %ecx, %ecx
717 ; AVX512NOTDQ-NEXT: testb $1, %al
718 ; AVX512NOTDQ-NEXT: movl $255, %eax
719 ; AVX512NOTDQ-NEXT: cmovel %ecx, %eax
720 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
721 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
722 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
723 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
724 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
725 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
726 ; AVX512NOTDQ-NEXT: retq
; IR under test: load mask, pick lane 1 into a one-element vector, store it.
727 %d0 = load <3 x i1>, <3 x i1>* %a0
728 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
729 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <3 x i1> mask from %a0, pick element 2 (the last one), and store it
; as <1 x i1> to %a1.
; Expected in both configurations: bit 2 is tested directly in memory
; (testb $4), a cmov selects 255 or 0, and that value is moved into a
; k-register before the shift sequence and store.
732 define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
733 ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
735 ; AVX512-NEXT: xorl %eax, %eax
736 ; AVX512-NEXT: testb $4, (%rdi)
737 ; AVX512-NEXT: movl $255, %ecx
738 ; AVX512-NEXT: cmovel %eax, %ecx
739 ; AVX512-NEXT: kmovd %ecx, %k0
740 ; AVX512-NEXT: kshiftrb $2, %k0, %k0
741 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
742 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
743 ; AVX512-NEXT: kmovb %k0, (%rsi)
746 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
747 ; AVX512NOTDQ: # %bb.0:
748 ; AVX512NOTDQ-NEXT: xorl %eax, %eax
749 ; AVX512NOTDQ-NEXT: testb $4, (%rdi)
750 ; AVX512NOTDQ-NEXT: movl $255, %ecx
751 ; AVX512NOTDQ-NEXT: cmovel %eax, %ecx
752 ; AVX512NOTDQ-NEXT: kmovd %ecx, %k0
753 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
754 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
755 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
756 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
757 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
758 ; AVX512NOTDQ-NEXT: retq
759 %d0 = load <3 x i1>, <3 x i1>* %a0
760 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
761 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <4 x i1> mask from %a0, pick element 2, and store it as <1 x i1>
; to %a1.
; Expected with +avx512dq: byte-wide k-register load, shift right by 2, bit 0
; isolated by a shift-left-7 / shift-right-7 pair, byte-wide mask store.
; Without +avx512dq the load and store go through a GPR and the shifts are
; word-wide.
764 define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
765 ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
767 ; AVX512-NEXT: kmovb (%rdi), %k0
768 ; AVX512-NEXT: kshiftrb $2, %k0, %k0
769 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
770 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
771 ; AVX512-NEXT: kmovb %k0, (%rsi)
774 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
775 ; AVX512NOTDQ: # %bb.0:
776 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
777 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
778 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
779 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
780 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
781 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
782 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
783 ; AVX512NOTDQ-NEXT: retq
784 %d0 = load <4 x i1>, <4 x i1>* %a0
785 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
786 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <4 x i1> mask from %a0, pick element 3 (the last one), and store it
; as <1 x i1> to %a1.
; Expected with +avx512dq: byte-wide k-register load, shift right by 3, bit 0
; isolated by a shift-left-7 / shift-right-7 pair, byte-wide mask store.
; Without +avx512dq the load and store go through a GPR and the shifts are
; word-wide.
789 define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
790 ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
792 ; AVX512-NEXT: kmovb (%rdi), %k0
793 ; AVX512-NEXT: kshiftrb $3, %k0, %k0
794 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
795 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
796 ; AVX512-NEXT: kmovb %k0, (%rsi)
799 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
800 ; AVX512NOTDQ: # %bb.0:
801 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
802 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
803 ; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0
804 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
805 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
806 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
807 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
808 ; AVX512NOTDQ-NEXT: retq
809 %d0 = load <4 x i1>, <4 x i1>* %a0
810 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
811 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <8 x i1> mask from %a0, pick element 4, and store it as <1 x i1>
; to %a1.
; Expected with +avx512dq: byte-wide k-register load, shift right by 4, bit 0
; isolated by a shift-left-7 / shift-right-7 pair, byte-wide mask store.
; Without +avx512dq the load and store go through a GPR and the shifts are
; word-wide.
814 define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
815 ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
817 ; AVX512-NEXT: kmovb (%rdi), %k0
818 ; AVX512-NEXT: kshiftrb $4, %k0, %k0
819 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
820 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
821 ; AVX512-NEXT: kmovb %k0, (%rsi)
824 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
825 ; AVX512NOTDQ: # %bb.0:
826 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
827 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
828 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0
829 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
830 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
831 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
832 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
833 ; AVX512NOTDQ-NEXT: retq
834 %d0 = load <8 x i1>, <8 x i1>* %a0
835 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
836 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <8 x i1> mask from %a0, splat element 4 into a <2 x i1>, and store
; the result to %a1.
; Expected with +avx512dq: the mask is shifted right by 4, expanded to a qword
; vector (vpmovm2q), element 0 is broadcast, and the vector is converted back
; to a mask (vpmovq2m). Without +avx512dq the expansion is a zero-masked move
; of all-ones, the conversion back is vptestmq, and the store uses a GPR.
839 define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
840 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
842 ; AVX512-NEXT: kmovb (%rdi), %k0
843 ; AVX512-NEXT: kshiftrb $4, %k0, %k0
844 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
845 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
846 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
847 ; AVX512-NEXT: kmovb %k0, (%rsi)
850 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
851 ; AVX512NOTDQ: # %bb.0:
852 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
853 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
854 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
855 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
856 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
857 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
858 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
859 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
860 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
861 ; AVX512NOTDQ-NEXT: retq
862 %d0 = load <8 x i1>, <8 x i1>* %a0
863 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
864 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <8 x i1> mask from %a0, pick element 7 (the last one), and store it
; as <1 x i1> to %a1.
; Expected with +avx512dq: byte-wide k-register load, shift right by 7, bit 0
; isolated by a shift-left-7 / shift-right-7 pair, byte-wide mask store.
; Without +avx512dq the load and store go through a GPR and the shifts are
; word-wide.
867 define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
868 ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
870 ; AVX512-NEXT: kmovb (%rdi), %k0
871 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
872 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
873 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
874 ; AVX512-NEXT: kmovb %k0, (%rsi)
877 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
878 ; AVX512NOTDQ: # %bb.0:
879 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
880 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
881 ; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0
882 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
883 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
884 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
885 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
886 ; AVX512NOTDQ-NEXT: retq
887 %d0 = load <8 x i1>, <8 x i1>* %a0
888 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
889 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <8 x i1> mask from %a0, splat element 7 into a <2 x i1>, and store
; the result to %a1.
; Expected: the mask is shifted right by 6 so the wanted bit lands in lane 1
; of the expanded qword vector, then vpshufd [2,3,2,3] replicates that upper
; qword. With +avx512dq the mask/vector conversions are vpmovm2q / vpmovq2m;
; without it they are a zero-masked move of all-ones and vptestmq.
892 define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
893 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
895 ; AVX512-NEXT: kmovb (%rdi), %k0
896 ; AVX512-NEXT: kshiftrb $6, %k0, %k0
897 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
898 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
899 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
900 ; AVX512-NEXT: kmovb %k0, (%rsi)
903 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
904 ; AVX512NOTDQ: # %bb.0:
905 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
906 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
907 ; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
908 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
909 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
910 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
911 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
912 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
913 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
914 ; AVX512NOTDQ-NEXT: retq
915 %d0 = load <8 x i1>, <8 x i1>* %a0
916 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
917 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <16 x i1> mask from %a0, pick element 8, and store it as <1 x i1>
; to %a1.
; Expected in both configurations: word-wide mask load and shift right by 8.
; With +avx512dq bit 0 is then isolated with byte-wide shifts and stored with
; kmovb; without it the isolation uses word-wide shifts and the store goes
; through a GPR.
920 define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
921 ; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
923 ; AVX512-NEXT: kmovw (%rdi), %k0
924 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
925 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
926 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
927 ; AVX512-NEXT: kmovb %k0, (%rsi)
930 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
931 ; AVX512NOTDQ: # %bb.0:
932 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
933 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0
934 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
935 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
936 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
937 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
938 ; AVX512NOTDQ-NEXT: retq
939 %d0 = load <16 x i1>, <16 x i1>* %a0
940 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
941 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <16 x i1> mask from %a0, splat element 8 into a <2 x i1>, and store
; the result to %a1.
; Expected: word-wide mask load, shift right by 8, expansion to a qword
; vector, broadcast of element 0, conversion back to a mask. With +avx512dq
; the conversions are vpmovm2q / vpmovq2m; without it they are a zero-masked
; move of all-ones and vptestmq, and the store goes through a GPR.
944 define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
945 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
947 ; AVX512-NEXT: kmovw (%rdi), %k0
948 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
949 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
950 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
951 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
952 ; AVX512-NEXT: kmovb %k0, (%rsi)
955 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
956 ; AVX512NOTDQ: # %bb.0:
957 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
958 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
959 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
960 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
961 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
962 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
963 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
964 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
965 ; AVX512NOTDQ-NEXT: retq
966 %d0 = load <16 x i1>, <16 x i1>* %a0
967 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
968 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <16 x i1> mask from %a0, splat element 8 into a <4 x i1>, and store
; the result to %a1.
; Expected: word-wide mask load, shift right by 8, expansion to a dword
; vector, broadcast of element 0, conversion back to a mask. With +avx512dq
; the conversions are vpmovm2d / vpmovd2m; without it they are a zero-masked
; move of all-ones and vptestmd, and the store goes through a GPR.
971 define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
972 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
974 ; AVX512-NEXT: kmovw (%rdi), %k0
975 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
976 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
977 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
978 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
979 ; AVX512-NEXT: kmovb %k0, (%rsi)
982 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
983 ; AVX512NOTDQ: # %bb.0:
984 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
985 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
986 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
987 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
988 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
989 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
990 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
991 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
992 ; AVX512NOTDQ-NEXT: retq
993 %d0 = load <16 x i1>, <16 x i1>* %a0
994 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
995 store <4 x i1> %d1, <4 x i1>* %a1
; Load a <16 x i1> mask from %a0, pick element 15 (the last one), and store
; it as <1 x i1> to %a1.
; Expected in both configurations: word-wide mask load and shift right by 15.
; With +avx512dq bit 0 is then isolated with byte-wide shifts and stored with
; kmovb; without it the isolation uses word-wide shifts and the store goes
; through a GPR.
998 define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
999 ; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
1001 ; AVX512-NEXT: kmovw (%rdi), %k0
1002 ; AVX512-NEXT: kshiftrw $15, %k0, %k0
1003 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
1004 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
1005 ; AVX512-NEXT: kmovb %k0, (%rsi)
1008 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
1009 ; AVX512NOTDQ: # %bb.0:
1010 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
1011 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
1012 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
1013 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
1014 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1015 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1016 ; AVX512NOTDQ-NEXT: retq
1017 %d0 = load <16 x i1>, <16 x i1>* %a0
1018 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
1019 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <16 x i1> mask from %a0, splat element 15 into a <2 x i1>, and store
; the result to %a1.
; Expected: the mask is shifted right by 14 so the wanted bit lands in lane 1
; of the expanded qword vector, then vpshufd [2,3,2,3] replicates that upper
; qword. With +avx512dq the conversions are vpmovm2q / vpmovq2m; without it
; they are a zero-masked move of all-ones and vptestmq.
1022 define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
1023 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
1025 ; AVX512-NEXT: kmovw (%rdi), %k0
1026 ; AVX512-NEXT: kshiftrw $14, %k0, %k0
1027 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1028 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1029 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1030 ; AVX512-NEXT: kmovb %k0, (%rsi)
1033 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
1034 ; AVX512NOTDQ: # %bb.0:
1035 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
1036 ; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
1037 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1038 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1039 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1040 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1041 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1042 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1043 ; AVX512NOTDQ-NEXT: retq
1044 %d0 = load <16 x i1>, <16 x i1>* %a0
1045 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
1046 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <16 x i1> mask from %a0, splat element 15 into a <4 x i1>, and store
; the result to %a1.
; Expected: the mask is shifted right by 12 so the wanted bit lands in lane 3
; of the expanded dword vector, then vpshufd [3,3,3,3] replicates that lane.
; With +avx512dq the conversions are vpmovm2d / vpmovd2m; without it they are
; a zero-masked move of all-ones and vptestmd.
1049 define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
1050 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
1052 ; AVX512-NEXT: kmovw (%rdi), %k0
1053 ; AVX512-NEXT: kshiftrw $12, %k0, %k0
1054 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1055 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1056 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1057 ; AVX512-NEXT: kmovb %k0, (%rsi)
1060 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
1061 ; AVX512NOTDQ: # %bb.0:
1062 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
1063 ; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
1064 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1065 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1066 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1067 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1068 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1069 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1070 ; AVX512NOTDQ-NEXT: retq
1071 %d0 = load <16 x i1>, <16 x i1>* %a0
1072 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
1073 store <4 x i1> %d1, <4 x i1>* %a1
; Load a <32 x i1> mask from %a0, pick element 16, and store it as <1 x i1>
; to %a1.
; Expected in both configurations: dword-wide mask load and shift right by 16.
; With +avx512dq bit 0 is then isolated with byte-wide shifts and stored with
; kmovb; without it the isolation uses word-wide shifts and the store goes
; through a GPR.
1076 define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
1077 ; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
1079 ; AVX512-NEXT: kmovd (%rdi), %k0
1080 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
1081 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
1082 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
1083 ; AVX512-NEXT: kmovb %k0, (%rsi)
1086 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
1087 ; AVX512NOTDQ: # %bb.0:
1088 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1089 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0
1090 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
1091 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
1092 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1093 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1094 ; AVX512NOTDQ-NEXT: retq
1095 %d0 = load <32 x i1>, <32 x i1>* %a0
1096 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
1097 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <32 x i1> mask from %a0, splat element 16 into a <2 x i1>, and store
; the result to %a1.
; Expected: dword-wide mask load, shift right by 16, expansion to a qword
; vector, broadcast of element 0, conversion back to a mask. With +avx512dq
; the conversions are vpmovm2q / vpmovq2m; without it they are a zero-masked
; move of all-ones and vptestmq, and the store goes through a GPR.
1100 define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
1101 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
1103 ; AVX512-NEXT: kmovd (%rdi), %k0
1104 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
1105 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1106 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
1107 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1108 ; AVX512-NEXT: kmovb %k0, (%rsi)
1111 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
1112 ; AVX512NOTDQ: # %bb.0:
1113 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1114 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
1115 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1116 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1117 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
1118 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1119 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1120 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1121 ; AVX512NOTDQ-NEXT: retq
1122 %d0 = load <32 x i1>, <32 x i1>* %a0
1123 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
1124 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <32 x i1> mask from %a0, splat element 16 into a <4 x i1>, and store
; the result to %a1.
; Expected: dword-wide mask load, shift right by 16, expansion to a dword
; vector, broadcast of element 0, conversion back to a mask. With +avx512dq
; the conversions are vpmovm2d / vpmovd2m; without it they are a zero-masked
; move of all-ones and vptestmd, and the store goes through a GPR.
1127 define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
1128 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
1130 ; AVX512-NEXT: kmovd (%rdi), %k0
1131 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
1132 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1133 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
1134 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1135 ; AVX512-NEXT: kmovb %k0, (%rsi)
1138 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
1139 ; AVX512NOTDQ: # %bb.0:
1140 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1141 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
1142 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1143 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1144 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
1145 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1146 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1147 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1148 ; AVX512NOTDQ-NEXT: retq
1149 %d0 = load <32 x i1>, <32 x i1>* %a0
1150 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
1151 store <4 x i1> %d1, <4 x i1>* %a1
; Load a <32 x i1> mask from %a0, splat element 16 into a <8 x i1>, and store
; the result to %a1.
; Expected: no k-register shift; the mask is loaded straight from byte offset
; 2, where bit 16 is the lowest bit, then expanded to a ymm dword vector,
; element 0 is broadcast, and the vector is converted back to a mask.
; vzeroupper is expected because ymm registers are used.
1154 define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
1155 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
1157 ; AVX512-NEXT: kmovb 2(%rdi), %k0
1158 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1159 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
1160 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1161 ; AVX512-NEXT: kmovb %k0, (%rsi)
1162 ; AVX512-NEXT: vzeroupper
1165 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
1166 ; AVX512NOTDQ: # %bb.0:
1167 ; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1
1168 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1169 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1170 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
1171 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1172 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1173 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1174 ; AVX512NOTDQ-NEXT: vzeroupper
1175 ; AVX512NOTDQ-NEXT: retq
1176 %d0 = load <32 x i1>, <32 x i1>* %a0
1177 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
1178 store <8 x i1> %d1, <8 x i1>* %a1
; Load a <32 x i1> mask from %a0, pick element 31 (the last one), and store
; it as <1 x i1> to %a1.
; Expected in both configurations: dword-wide mask load and shift right by 31.
; With +avx512dq bit 0 is then isolated with byte-wide shifts and stored with
; kmovb; without it the isolation uses word-wide shifts and the store goes
; through a GPR.
1181 define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
1182 ; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
1184 ; AVX512-NEXT: kmovd (%rdi), %k0
1185 ; AVX512-NEXT: kshiftrd $31, %k0, %k0
1186 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
1187 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
1188 ; AVX512-NEXT: kmovb %k0, (%rsi)
1191 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
1192 ; AVX512NOTDQ: # %bb.0:
1193 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1194 ; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0
1195 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
1196 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
1197 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1198 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1199 ; AVX512NOTDQ-NEXT: retq
1200 %d0 = load <32 x i1>, <32 x i1>* %a0
1201 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
1202 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <32 x i1> mask from %a0, splat element 31 into a <2 x i1>, and store
; the result to %a1.
; Expected: the mask is shifted right by 30 so the wanted bit lands in lane 1
; of the expanded qword vector, then vpshufd [2,3,2,3] replicates that upper
; qword. With +avx512dq the conversions are vpmovm2q / vpmovq2m; without it
; they are a zero-masked move of all-ones and vptestmq.
1205 define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
1206 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
1208 ; AVX512-NEXT: kmovd (%rdi), %k0
1209 ; AVX512-NEXT: kshiftrd $30, %k0, %k0
1210 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1211 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1212 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1213 ; AVX512-NEXT: kmovb %k0, (%rsi)
1216 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
1217 ; AVX512NOTDQ: # %bb.0:
1218 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1219 ; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
1220 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1221 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1222 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1223 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1224 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1225 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1226 ; AVX512NOTDQ-NEXT: retq
1227 %d0 = load <32 x i1>, <32 x i1>* %a0
1228 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
1229 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <32 x i1> mask from %a0, splat element 31 into a <4 x i1>, and store
; the result to %a1.
; Expected: the mask is shifted right by 28 so the wanted bit lands in lane 3
; of the expanded dword vector, then vpshufd [3,3,3,3] replicates that lane.
; With +avx512dq the conversions are vpmovm2d / vpmovd2m; without it they are
; a zero-masked move of all-ones and vptestmd.
1232 define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
1233 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
1235 ; AVX512-NEXT: kmovd (%rdi), %k0
1236 ; AVX512-NEXT: kshiftrd $28, %k0, %k0
1237 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1238 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1239 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1240 ; AVX512-NEXT: kmovb %k0, (%rsi)
1243 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
1244 ; AVX512NOTDQ: # %bb.0:
1245 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1246 ; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
1247 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1248 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1249 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1250 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1251 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1252 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1253 ; AVX512NOTDQ-NEXT: retq
1254 %d0 = load <32 x i1>, <32 x i1>* %a0
1255 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
1256 store <4 x i1> %d1, <4 x i1>* %a1
; Load a <32 x i1> mask from %a0, splat element 31 into a <8 x i1>, and store
; the result to %a1.
; The expected code differs across all four configurations, so each has its
; own check block. All of them load the mask byte at offset 3 (bits 24..31)
; and expand it to a ymm dword vector, so the wanted bit is lane 7.
;  - With +fast-variable-crosslane-shuffle: one vpermd with an all-7 index.
;  - With only +fast-variable-perlane-shuffle: vpshufd [3,3,3,3,7,7,7,7]
;    followed by vpermq [2,2,2,2] to copy the upper half everywhere.
; The +avx512dq variants use vpmovm2d / vpmovd2m; the others use a
; zero-masked move of all-ones and vptestmd, with GPR load and store.
1259 define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
1260 ; AVX512-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store:
1261 ; AVX512-FAST: # %bb.0:
1262 ; AVX512-FAST-NEXT: kmovb 3(%rdi), %k0
1263 ; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm0
1264 ; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1265 ; AVX512-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1266 ; AVX512-FAST-NEXT: vpmovd2m %ymm0, %k0
1267 ; AVX512-FAST-NEXT: kmovb %k0, (%rsi)
1268 ; AVX512-FAST-NEXT: vzeroupper
1269 ; AVX512-FAST-NEXT: retq
1271 ; AVX512-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store:
1272 ; AVX512-FAST-PERLANE: # %bb.0:
1273 ; AVX512-FAST-PERLANE-NEXT: kmovb 3(%rdi), %k0
1274 ; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0
1275 ; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
1276 ; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1277 ; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0
1278 ; AVX512-FAST-PERLANE-NEXT: kmovb %k0, (%rsi)
1279 ; AVX512-FAST-PERLANE-NEXT: vzeroupper
1280 ; AVX512-FAST-PERLANE-NEXT: retq
1282 ; AVX512NOTDQ-FAST-LABEL: load_v32i1_broadcast_31_v8i1_store:
1283 ; AVX512NOTDQ-FAST: # %bb.0:
1284 ; AVX512NOTDQ-FAST-NEXT: movzbl 3(%rdi), %eax
1285 ; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1
1286 ; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1287 ; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1288 ; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1289 ; AVX512NOTDQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1290 ; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0
1291 ; AVX512NOTDQ-FAST-NEXT: kmovd %k0, %eax
1292 ; AVX512NOTDQ-FAST-NEXT: movb %al, (%rsi)
1293 ; AVX512NOTDQ-FAST-NEXT: vzeroupper
1294 ; AVX512NOTDQ-FAST-NEXT: retq
1296 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v32i1_broadcast_31_v8i1_store:
1297 ; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
1298 ; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 3(%rdi), %eax
1299 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1
1300 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1301 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1302 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
1303 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1304 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm0, %ymm0, %k0
1305 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %k0, %eax
1306 ; AVX512NOTDQ-FAST-PERLANE-NEXT: movb %al, (%rsi)
1307 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
1308 ; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
1309 %d0 = load <32 x i1>, <32 x i1>* %a0
1310 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
1311 store <8 x i1> %d1, <8 x i1>* %a1
; Load a <64 x i1> mask from %a0, pick element 32, and store it as <1 x i1>
; to %a1.
; Expected in both configurations: qword-wide mask load and shift right by 32.
; With +avx512dq bit 0 is then isolated with byte-wide shifts and stored with
; kmovb; without it the isolation uses word-wide shifts and the store goes
; through a GPR.
1314 define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
1315 ; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
1317 ; AVX512-NEXT: kmovq (%rdi), %k0
1318 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1319 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
1320 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
1321 ; AVX512-NEXT: kmovb %k0, (%rsi)
1324 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
1325 ; AVX512NOTDQ: # %bb.0:
1326 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1327 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0
1328 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
1329 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
1330 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1331 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1332 ; AVX512NOTDQ-NEXT: retq
1333 %d0 = load <64 x i1>, <64 x i1>* %a0
1334 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
1335 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <64 x i1> mask from %a0, splat element 32 into a <2 x i1>, and store
; the result to %a1.
; Expected: qword-wide mask load, shift right by 32, expansion to a qword
; vector, broadcast of element 0, conversion back to a mask. With +avx512dq
; the conversions are vpmovm2q / vpmovq2m; without it they are a zero-masked
; move of all-ones and vptestmq, and the store goes through a GPR.
1338 define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
1339 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
1341 ; AVX512-NEXT: kmovq (%rdi), %k0
1342 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1343 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1344 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
1345 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1346 ; AVX512-NEXT: kmovb %k0, (%rsi)
1349 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
1350 ; AVX512NOTDQ: # %bb.0:
1351 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1352 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
1353 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1354 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1355 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
1356 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1357 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1358 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1359 ; AVX512NOTDQ-NEXT: retq
1360 %d0 = load <64 x i1>, <64 x i1>* %a0
1361 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
1362 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <64 x i1> mask from %a0, splat element 32 into a <4 x i1>, and store
; the result to %a1.
; Expected: qword-wide mask load, shift right by 32, expansion to a dword
; vector, broadcast of element 0, conversion back to a mask. With +avx512dq
; the conversions are vpmovm2d / vpmovd2m; without it they are a zero-masked
; move of all-ones and vptestmd, and the store goes through a GPR.
1365 define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
1366 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
1368 ; AVX512-NEXT: kmovq (%rdi), %k0
1369 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1370 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1371 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
1372 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1373 ; AVX512-NEXT: kmovb %k0, (%rsi)
1376 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
1377 ; AVX512NOTDQ: # %bb.0:
1378 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1379 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
1380 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1381 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1382 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
1383 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1384 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1385 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1386 ; AVX512NOTDQ-NEXT: retq
1387 %d0 = load <64 x i1>, <64 x i1>* %a0
1388 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
1389 store <4 x i1> %d1, <4 x i1>* %a1
; Load a <64 x i1> mask from %a0, splat element 32 into a <8 x i1>, and store
; the result to %a1.
; Expected: no k-register shift; the mask is loaded straight from byte offset
; 4, where bit 32 is the lowest bit, then expanded to a ymm dword vector,
; element 0 is broadcast, and the vector is converted back to a mask.
; vzeroupper is expected because ymm registers are used.
1392 define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
1393 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
1395 ; AVX512-NEXT: kmovb 4(%rdi), %k0
1396 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1397 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
1398 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1399 ; AVX512-NEXT: kmovb %k0, (%rsi)
1400 ; AVX512-NEXT: vzeroupper
1403 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
1404 ; AVX512NOTDQ: # %bb.0:
1405 ; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
1406 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1407 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1408 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
1409 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1410 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1411 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1412 ; AVX512NOTDQ-NEXT: vzeroupper
1413 ; AVX512NOTDQ-NEXT: retq
1414 %d0 = load <64 x i1>, <64 x i1>* %a0
1415 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
1416 store <8 x i1> %d1, <8 x i1>* %a1
; Load a <64 x i1> mask from %a0, splat element 32 into a <16 x i1>, and
; store the result to %a1.
; Expected: a word-wide mask load straight from byte offset 4 (bit 32 is its
; lowest bit), expansion to a zmm dword vector, broadcast of element 0,
; conversion back to a mask, and a word-wide mask store in both
; configurations. Without +avx512dq the expansion is a zero-masked
; vpternlogd $255 and the conversion back is vptestmd.
1419 define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
1420 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
1422 ; AVX512-NEXT: kmovw 4(%rdi), %k0
1423 ; AVX512-NEXT: vpmovm2d %k0, %zmm0
1424 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
1425 ; AVX512-NEXT: vpmovd2m %zmm0, %k0
1426 ; AVX512-NEXT: kmovw %k0, (%rsi)
1427 ; AVX512-NEXT: vzeroupper
1430 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
1431 ; AVX512NOTDQ: # %bb.0:
1432 ; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
1433 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1434 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
1435 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
1436 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
1437 ; AVX512NOTDQ-NEXT: vzeroupper
1438 ; AVX512NOTDQ-NEXT: retq
1439 %d0 = load <64 x i1>, <64 x i1>* %a0
1440 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
1441 store <16 x i1> %d1, <16 x i1>* %a1
; Load a <64 x i1> mask from %a0, pick element 63 (the last one), and store
; it as <1 x i1> to %a1.
; Expected in both configurations: qword-wide mask load and shift right by 63.
; With +avx512dq bit 0 is then isolated with byte-wide shifts and stored with
; kmovb; without it the isolation uses word-wide shifts and the store goes
; through a GPR.
1444 define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
1445 ; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
1447 ; AVX512-NEXT: kmovq (%rdi), %k0
1448 ; AVX512-NEXT: kshiftrq $63, %k0, %k0
1449 ; AVX512-NEXT: kshiftlb $7, %k0, %k0
1450 ; AVX512-NEXT: kshiftrb $7, %k0, %k0
1451 ; AVX512-NEXT: kmovb %k0, (%rsi)
1454 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
1455 ; AVX512NOTDQ: # %bb.0:
1456 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1457 ; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0
1458 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
1459 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
1460 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1461 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1462 ; AVX512NOTDQ-NEXT: retq
1463 %d0 = load <64 x i1>, <64 x i1>* %a0
1464 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
1465 store <1 x i1> %d1, <1 x i1>* %a1
; Load a <64 x i1> mask from %a0, splat element 63 into a <2 x i1>, and store
; the result to %a1.
; Expected: the mask is shifted right by 62 so the wanted bit lands in lane 1
; of the expanded qword vector, then vpshufd [2,3,2,3] replicates that upper
; qword. With +avx512dq the conversions are vpmovm2q / vpmovq2m; without it
; they are a zero-masked move of all-ones and vptestmq.
1468 define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
1469 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
1471 ; AVX512-NEXT: kmovq (%rdi), %k0
1472 ; AVX512-NEXT: kshiftrq $62, %k0, %k0
1473 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1474 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1475 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1476 ; AVX512-NEXT: kmovb %k0, (%rsi)
1479 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
1480 ; AVX512NOTDQ: # %bb.0:
1481 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1482 ; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
1483 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1484 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1485 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1486 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1487 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1488 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1489 ; AVX512NOTDQ-NEXT: retq
1490 %d0 = load <64 x i1>, <64 x i1>* %a0
1491 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
1492 store <2 x i1> %d1, <2 x i1>* %a1
; Load a <64 x i1> mask from %a0, splat element 63 into a <4 x i1>, and store
; the result to %a1.
; Expected: the mask is shifted right by 60 so the wanted bit lands in lane 3
; of the expanded dword vector, then vpshufd [3,3,3,3] replicates that lane.
; With +avx512dq the conversions are vpmovm2d / vpmovd2m; without it they are
; a zero-masked move of all-ones and vptestmd.
1495 define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
1496 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
1498 ; AVX512-NEXT: kmovq (%rdi), %k0
1499 ; AVX512-NEXT: kshiftrq $60, %k0, %k0
1500 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1501 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1502 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1503 ; AVX512-NEXT: kmovb %k0, (%rsi)
1506 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
1507 ; AVX512NOTDQ: # %bb.0:
1508 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1509 ; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
1510 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1511 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1512 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1513 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1514 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1515 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1516 ; AVX512NOTDQ-NEXT: retq
1517 %d0 = load <64 x i1>, <64 x i1>* %a0
1518 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
1519 store <4 x i1> %d1, <4 x i1>* %a1
; Test: load a <64 x i1> mask from %a0, splat its last element (index 63) into
; an <8 x i1> vector, and store that vector to %a1.
;
; Each of the four RUN configurations has its own check prefix here because the
; expected 256-bit splat differs with the shuffle tuning flags:
;  - with +fast-variable-crosslane-shuffle, one vpermd driven by a broadcast
;    index vector of 7s;
;  - with only +fast-variable-perlane-shuffle, an in-lane vpshufd followed by
;    a vpermq [2,2,2,2].
; Every variant reads only the byte at offset 7 of the mask (elements 56..63),
; so element 63 is lane 7 of the expanded ymm vector.
1522 define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
1523 ; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
1524 ; AVX512-FAST: # %bb.0:
1525 ; AVX512-FAST-NEXT: kmovb 7(%rdi), %k0
1526 ; AVX512-FAST-NEXT: vpmovm2d %k0, %ymm0
1527 ; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1528 ; AVX512-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1529 ; AVX512-FAST-NEXT: vpmovd2m %ymm0, %k0
1530 ; AVX512-FAST-NEXT: kmovb %k0, (%rsi)
1531 ; AVX512-FAST-NEXT: vzeroupper
1532 ; AVX512-FAST-NEXT: retq
1534 ; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
1535 ; AVX512-FAST-PERLANE: # %bb.0:
1536 ; AVX512-FAST-PERLANE-NEXT: kmovb 7(%rdi), %k0
1537 ; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0
1538 ; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
1539 ; AVX512-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1540 ; AVX512-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0
1541 ; AVX512-FAST-PERLANE-NEXT: kmovb %k0, (%rsi)
1542 ; AVX512-FAST-PERLANE-NEXT: vzeroupper
1543 ; AVX512-FAST-PERLANE-NEXT: retq
1545 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v8i1_store:
1546 ; AVX512NOTDQ-FAST: # %bb.0:
1547 ; AVX512NOTDQ-FAST-NEXT: movzbl 7(%rdi), %eax
1548 ; AVX512NOTDQ-FAST-NEXT: kmovd %eax, %k1
1549 ; AVX512NOTDQ-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1550 ; AVX512NOTDQ-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1551 ; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1552 ; AVX512NOTDQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
1553 ; AVX512NOTDQ-FAST-NEXT: vptestmd %ymm0, %ymm0, %k0
1554 ; AVX512NOTDQ-FAST-NEXT: kmovd %k0, %eax
1555 ; AVX512NOTDQ-FAST-NEXT: movb %al, (%rsi)
1556 ; AVX512NOTDQ-FAST-NEXT: vzeroupper
1557 ; AVX512NOTDQ-FAST-NEXT: retq
1559 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v8i1_store:
1560 ; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
1561 ; AVX512NOTDQ-FAST-PERLANE-NEXT: movzbl 7(%rdi), %eax
1562 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %eax, %k1
1563 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1564 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1565 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
1566 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1567 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %ymm0, %ymm0, %k0
1568 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovd %k0, %eax
1569 ; AVX512NOTDQ-FAST-PERLANE-NEXT: movb %al, (%rsi)
1570 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
1571 ; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
; IR under test. The check lines above are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1572 %d0 = load <64 x i1>, <64 x i1>* %a0
1573 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
1574 store <8 x i1> %d1, <8 x i1>* %a1
; Test: load a <64 x i1> mask from %a0, splat its last element (index 63) into
; a <16 x i1> vector, and store that vector to %a1.
;
; Each of the four RUN configurations has its own check prefix here because the
; expected 512-bit splat differs with the shuffle tuning flags:
;  - with +fast-variable-crosslane-shuffle, one vpermd driven by a broadcast
;    index vector of 15s;
;  - with only +fast-variable-perlane-shuffle, an in-lane vpshufd followed by
;    a vshufi64x2 [6,7,6,7,6,7,6,7].
; Every variant reads only the 16-bit word at offset 6 of the mask (elements
; 48..63), so element 63 is lane 15 of the expanded zmm vector. Without
; +avx512dq the mask is expanded with a zero-masked vpternlogd $255 and turned
; back into a mask with vptestmd.
1577 define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
1578 ; AVX512-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
1579 ; AVX512-FAST: # %bb.0:
1580 ; AVX512-FAST-NEXT: kmovw 6(%rdi), %k0
1581 ; AVX512-FAST-NEXT: vpmovm2d %k0, %zmm0
1582 ; AVX512-FAST-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1583 ; AVX512-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
1584 ; AVX512-FAST-NEXT: vpmovd2m %zmm0, %k0
1585 ; AVX512-FAST-NEXT: kmovw %k0, (%rsi)
1586 ; AVX512-FAST-NEXT: vzeroupper
1587 ; AVX512-FAST-NEXT: retq
1589 ; AVX512-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
1590 ; AVX512-FAST-PERLANE: # %bb.0:
1591 ; AVX512-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k0
1592 ; AVX512-FAST-PERLANE-NEXT: vpmovm2d %k0, %zmm0
1593 ; AVX512-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
1594 ; AVX512-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
1595 ; AVX512-FAST-PERLANE-NEXT: vpmovd2m %zmm0, %k0
1596 ; AVX512-FAST-PERLANE-NEXT: kmovw %k0, (%rsi)
1597 ; AVX512-FAST-PERLANE-NEXT: vzeroupper
1598 ; AVX512-FAST-PERLANE-NEXT: retq
1600 ; AVX512NOTDQ-FAST-LABEL: load_v64i1_broadcast_63_v16i1_store:
1601 ; AVX512NOTDQ-FAST: # %bb.0:
1602 ; AVX512NOTDQ-FAST-NEXT: kmovw 6(%rdi), %k1
1603 ; AVX512NOTDQ-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1604 ; AVX512NOTDQ-FAST-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1605 ; AVX512NOTDQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
1606 ; AVX512NOTDQ-FAST-NEXT: vptestmd %zmm0, %zmm0, %k0
1607 ; AVX512NOTDQ-FAST-NEXT: kmovw %k0, (%rsi)
1608 ; AVX512NOTDQ-FAST-NEXT: vzeroupper
1609 ; AVX512NOTDQ-FAST-NEXT: retq
1611 ; AVX512NOTDQ-FAST-PERLANE-LABEL: load_v64i1_broadcast_63_v16i1_store:
1612 ; AVX512NOTDQ-FAST-PERLANE: # %bb.0:
1613 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw 6(%rdi), %k1
1614 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1615 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
1616 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
1617 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vptestmd %zmm0, %zmm0, %k0
1618 ; AVX512NOTDQ-FAST-PERLANE-NEXT: kmovw %k0, (%rsi)
1619 ; AVX512NOTDQ-FAST-PERLANE-NEXT: vzeroupper
1620 ; AVX512NOTDQ-FAST-PERLANE-NEXT: retq
; IR under test. The check lines above are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1621 %d0 = load <64 x i1>, <64 x i1>* %a0
1622 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
1623 store <16 x i1> %d1, <16 x i1>* %a1