1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
; Broadcast bit 4 of a loaded <8 x i1> mask to <2 x i1>; use it to select
; between the two <2 x double> args and store (DQ vs non-DQ lowering).
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
5 define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
6 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
8 ; AVX512-NEXT: kmovb 4(%rdi), %k0
9 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
10 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
11 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
12 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
13 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
16 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
17 ; AVX512NOTDQ: # %bb.0:
18 ; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
19 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
20 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
21 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
22 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
23 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
24 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
25 ; AVX512NOTDQ-NEXT: retq
26 %d0 = load <8 x i1>, <8 x i1>* %a0
27 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
28 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
29 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 7 of a loaded <8 x i1> mask to <2 x i1> (lowered via vpshufd
; lane duplication rather than vpbroadcastq); select <2 x double> and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
32 define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
33 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
35 ; AVX512-NEXT: kmovb 6(%rdi), %k0
36 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
37 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
38 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
39 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
40 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
43 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
44 ; AVX512NOTDQ: # %bb.0:
45 ; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1
46 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
47 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
48 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
49 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
50 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
51 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
52 ; AVX512NOTDQ-NEXT: retq
53 %d0 = load <8 x i1>, <8 x i1>* %a0
54 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
55 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
56 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 8 of a loaded <16 x i1> mask to <2 x i1>; select <2 x double>
; and store. Checks the mask load is narrowed to an offset byte/word load.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
59 define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
60 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
62 ; AVX512-NEXT: kmovb 8(%rdi), %k0
63 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
64 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
65 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
66 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
67 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
70 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
71 ; AVX512NOTDQ: # %bb.0:
72 ; AVX512NOTDQ-NEXT: kmovw 8(%rdi), %k1
73 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
74 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
75 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
76 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
77 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
78 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
79 ; AVX512NOTDQ-NEXT: retq
80 %d0 = load <16 x i1>, <16 x i1>* %a0
81 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
82 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
83 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 8 of a loaded <16 x i1> mask to <4 x i1>; select <4 x float>
; and store (dword-granularity mask ops: vpmovm2d / vmovdqa32).
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
86 define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
87 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
89 ; AVX512-NEXT: kmovb 8(%rdi), %k0
90 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
91 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
92 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
93 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
94 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
97 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
98 ; AVX512NOTDQ: # %bb.0:
99 ; AVX512NOTDQ-NEXT: kmovw 8(%rdi), %k1
100 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
101 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
102 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
103 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
104 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
105 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
106 ; AVX512NOTDQ-NEXT: retq
107 %d0 = load <16 x i1>, <16 x i1>* %a0
108 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
109 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
110 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 15 (last) of a loaded <16 x i1> mask to <2 x i1> via vpshufd
; high-lane duplication; select <2 x double> and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
113 define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
114 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
116 ; AVX512-NEXT: kmovb 14(%rdi), %k0
117 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
119 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
120 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
121 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
124 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
125 ; AVX512NOTDQ: # %bb.0:
126 ; AVX512NOTDQ-NEXT: kmovw 14(%rdi), %k1
127 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
128 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
129 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
130 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
131 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
132 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
133 ; AVX512NOTDQ-NEXT: retq
134 %d0 = load <16 x i1>, <16 x i1>* %a0
135 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
136 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
137 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 15 of a loaded <16 x i1> mask to <4 x i1> via vpshufd [3,3,3,3];
; select <4 x float> and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
140 define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
141 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
143 ; AVX512-NEXT: kmovb 12(%rdi), %k0
144 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
145 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
146 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
147 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
148 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
151 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
152 ; AVX512NOTDQ: # %bb.0:
153 ; AVX512NOTDQ-NEXT: kmovw 12(%rdi), %k1
154 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
155 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
156 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
157 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
158 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
159 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
160 ; AVX512NOTDQ-NEXT: retq
161 %d0 = load <16 x i1>, <16 x i1>* %a0
162 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
163 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
164 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 16 of a loaded <32 x i1> mask to <2 x i1>; select <2 x double>
; and store. Mask load narrowed to an offset-16 byte/word load.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
167 define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
168 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
170 ; AVX512-NEXT: kmovb 16(%rdi), %k0
171 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
172 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
173 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
174 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
175 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
178 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
179 ; AVX512NOTDQ: # %bb.0:
180 ; AVX512NOTDQ-NEXT: kmovw 16(%rdi), %k1
181 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
182 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
183 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
184 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
185 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
186 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
187 ; AVX512NOTDQ-NEXT: retq
188 %d0 = load <32 x i1>, <32 x i1>* %a0
189 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
190 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
191 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 16 of a loaded <32 x i1> mask to <4 x i1>; select <4 x float>
; and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
194 define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
195 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
197 ; AVX512-NEXT: kmovb 16(%rdi), %k0
198 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
199 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
200 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
201 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
202 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
205 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
206 ; AVX512NOTDQ: # %bb.0:
207 ; AVX512NOTDQ-NEXT: kmovw 16(%rdi), %k1
208 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
209 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
210 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
211 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
212 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
213 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
214 ; AVX512NOTDQ-NEXT: retq
215 %d0 = load <32 x i1>, <32 x i1>* %a0
216 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
217 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
218 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 16 of a loaded <32 x i1> mask to <8 x i1>; select <8 x float>
; (ymm ops, hence the vzeroupper check before returning to the caller).
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
221 define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
222 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
224 ; AVX512-NEXT: kmovb 16(%rdi), %k0
225 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
226 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
227 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
228 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
229 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
230 ; AVX512-NEXT: vzeroupper
233 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
234 ; AVX512NOTDQ: # %bb.0:
235 ; AVX512NOTDQ-NEXT: kmovw 16(%rdi), %k1
236 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
237 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
238 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
239 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
240 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
241 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
242 ; AVX512NOTDQ-NEXT: vzeroupper
243 ; AVX512NOTDQ-NEXT: retq
244 %d0 = load <32 x i1>, <32 x i1>* %a0
245 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
246 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
247 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 31 (last) of a loaded <32 x i1> mask to <2 x i1> via vpshufd;
; select <2 x double> and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
250 define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
251 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
253 ; AVX512-NEXT: kmovb 30(%rdi), %k0
254 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
255 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
256 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
257 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
258 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
261 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
262 ; AVX512NOTDQ: # %bb.0:
263 ; AVX512NOTDQ-NEXT: kmovw 30(%rdi), %k1
264 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
265 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
266 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
267 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
268 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
269 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
270 ; AVX512NOTDQ-NEXT: retq
271 %d0 = load <32 x i1>, <32 x i1>* %a0
272 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
273 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
274 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 31 of a loaded <32 x i1> mask to <4 x i1> via vpshufd [3,3,3,3];
; select <4 x float> and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
277 define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
278 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
280 ; AVX512-NEXT: kmovb 28(%rdi), %k0
281 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
282 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
283 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
284 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
285 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
288 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
289 ; AVX512NOTDQ: # %bb.0:
290 ; AVX512NOTDQ-NEXT: kmovw 28(%rdi), %k1
291 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
292 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
293 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
294 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
295 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
296 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
297 ; AVX512NOTDQ-NEXT: retq
298 %d0 = load <32 x i1>, <32 x i1>* %a0
299 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
300 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
301 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 31 of a loaded <32 x i1> mask to <8 x i1>: cross-lane element 7
; duplication needs vpermd with an all-7s index vector; select <8 x float>.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
304 define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
305 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
307 ; AVX512-NEXT: kmovb 24(%rdi), %k0
308 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
309 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
310 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
311 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
312 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
313 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
314 ; AVX512-NEXT: vzeroupper
317 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
318 ; AVX512NOTDQ: # %bb.0:
319 ; AVX512NOTDQ-NEXT: kmovw 24(%rdi), %k1
320 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
321 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
322 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
323 ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
324 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
325 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
326 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
327 ; AVX512NOTDQ-NEXT: vzeroupper
328 ; AVX512NOTDQ-NEXT: retq
329 %d0 = load <32 x i1>, <32 x i1>* %a0
330 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
331 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
332 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <2 x i1>; select <2 x double>
; and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
335 define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
336 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
338 ; AVX512-NEXT: kmovb 32(%rdi), %k0
339 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
340 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
341 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
342 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
343 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
346 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
347 ; AVX512NOTDQ: # %bb.0:
348 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
349 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
350 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
351 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
352 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
353 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
354 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
355 ; AVX512NOTDQ-NEXT: retq
356 %d0 = load <64 x i1>, <64 x i1>* %a0
357 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
358 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
359 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <4 x i1>; select <4 x float>
; and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
362 define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
363 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
365 ; AVX512-NEXT: kmovb 32(%rdi), %k0
366 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
367 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
368 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
369 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
370 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
373 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
374 ; AVX512NOTDQ: # %bb.0:
375 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
376 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
377 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
378 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
379 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
380 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
381 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
382 ; AVX512NOTDQ-NEXT: retq
383 %d0 = load <64 x i1>, <64 x i1>* %a0
384 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
385 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
386 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <8 x i1>; select <8 x float>
; (ymm ops, vzeroupper before return).
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
389 define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
390 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
392 ; AVX512-NEXT: kmovb 32(%rdi), %k0
393 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
394 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
395 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
396 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
397 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
398 ; AVX512-NEXT: vzeroupper
401 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
402 ; AVX512NOTDQ: # %bb.0:
403 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
404 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
405 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
406 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
407 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
408 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
409 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
410 ; AVX512NOTDQ-NEXT: vzeroupper
411 ; AVX512NOTDQ-NEXT: retq
412 %d0 = load <64 x i1>, <64 x i1>* %a0
413 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
414 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
415 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <16 x i1>; select <16 x float>.
; Non-DQ uses vpternlogd $255 (zmm all-ones) since vpcmpeqd has no zmm form pre-DQ here.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
418 define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
419 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
421 ; AVX512-NEXT: kmovw 32(%rdi), %k0
422 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
423 ; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
424 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
425 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
426 ; AVX512-NEXT: vmovaps %zmm1, (%rsi)
427 ; AVX512-NEXT: vzeroupper
430 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
431 ; AVX512NOTDQ: # %bb.0:
432 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
433 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
434 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
435 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
436 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
437 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
438 ; AVX512NOTDQ-NEXT: vzeroupper
439 ; AVX512NOTDQ-NEXT: retq
440 %d0 = load <64 x i1>, <64 x i1>* %a0
441 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
442 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
443 store <16 x float> %d2, <16 x float>* %a3
; Broadcast bit 63 (last) of a loaded <64 x i1> mask to <2 x i1> via vpshufd;
; select <2 x double> and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
446 define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
447 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
449 ; AVX512-NEXT: kmovb 62(%rdi), %k0
450 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
451 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
452 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
453 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
454 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
457 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
458 ; AVX512NOTDQ: # %bb.0:
459 ; AVX512NOTDQ-NEXT: kmovw 62(%rdi), %k1
460 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
461 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
462 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
463 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
464 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
465 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
466 ; AVX512NOTDQ-NEXT: retq
467 %d0 = load <64 x i1>, <64 x i1>* %a0
468 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
469 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
470 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 63 of a loaded <64 x i1> mask to <4 x i1> via vpshufd [3,3,3,3];
; select <4 x float> and store.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
473 define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
474 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
476 ; AVX512-NEXT: kmovb 60(%rdi), %k0
477 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
478 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
479 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
480 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
481 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
484 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
485 ; AVX512NOTDQ: # %bb.0:
486 ; AVX512NOTDQ-NEXT: kmovw 60(%rdi), %k1
487 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
488 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
489 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
490 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
491 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
492 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
493 ; AVX512NOTDQ-NEXT: retq
494 %d0 = load <64 x i1>, <64 x i1>* %a0
495 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
496 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
497 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 63 of a loaded <64 x i1> mask to <8 x i1>: cross-lane dup of
; element 7 via vpermd with an all-7s index; select <8 x float>.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
500 define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
501 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
503 ; AVX512-NEXT: kmovb 56(%rdi), %k0
504 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
505 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
506 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
507 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
508 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
509 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
510 ; AVX512-NEXT: vzeroupper
513 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
514 ; AVX512NOTDQ: # %bb.0:
515 ; AVX512NOTDQ-NEXT: kmovw 56(%rdi), %k1
516 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
517 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
518 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
519 ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
520 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
521 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
522 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
523 ; AVX512NOTDQ-NEXT: vzeroupper
524 ; AVX512NOTDQ-NEXT: retq
525 %d0 = load <64 x i1>, <64 x i1>* %a0
526 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
527 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
528 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 63 of a loaded <64 x i1> mask to <16 x i1>: vpermd with an
; all-15s zmm index; select <16 x float>. Non-DQ builds all-ones via vpternlogd.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
531 define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
532 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
534 ; AVX512-NEXT: kmovw 48(%rdi), %k0
535 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
536 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
537 ; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
538 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
539 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
540 ; AVX512-NEXT: vmovaps %zmm1, (%rsi)
541 ; AVX512-NEXT: vzeroupper
544 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
545 ; AVX512NOTDQ: # %bb.0:
546 ; AVX512NOTDQ-NEXT: kmovw 48(%rdi), %k1
547 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
548 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
549 ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
550 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
551 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
552 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
553 ; AVX512NOTDQ-NEXT: vzeroupper
554 ; AVX512NOTDQ-NEXT: retq
555 %d0 = load <64 x i1>, <64 x i1>* %a0
556 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
557 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
558 store <16 x float> %d2, <16 x float>* %a3
; Extract bit 1 of a loaded <2 x i1> as <1 x i1> and store it. Checks the whole
; round-trip folds to a 1-byte load/store (kmovb with DQ, movb without).
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
561 define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
562 ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
564 ; AVX512-NEXT: kmovb 1(%rdi), %k0
565 ; AVX512-NEXT: kmovb %k0, (%rsi)
568 ; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
569 ; AVX512NOTDQ: # %bb.0:
570 ; AVX512NOTDQ-NEXT: movb 1(%rdi), %al
571 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
572 ; AVX512NOTDQ-NEXT: retq
573 %d0 = load <2 x i1>, <2 x i1>* %a0
574 %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
575 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 1 of a loaded <3 x i1> as <1 x i1> and store; folds to a byte
; load/store at offset 1.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
578 define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
579 ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
581 ; AVX512-NEXT: kmovb 1(%rdi), %k0
582 ; AVX512-NEXT: kmovb %k0, (%rsi)
585 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
586 ; AVX512NOTDQ: # %bb.0:
587 ; AVX512NOTDQ-NEXT: movb 1(%rdi), %al
588 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
589 ; AVX512NOTDQ-NEXT: retq
590 %d0 = load <3 x i1>, <3 x i1>* %a0
591 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
592 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 2 of a loaded <3 x i1> as <1 x i1> and store; folds to a byte
; load/store at offset 2.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
595 define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
596 ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
598 ; AVX512-NEXT: kmovb 2(%rdi), %k0
599 ; AVX512-NEXT: kmovb %k0, (%rsi)
602 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
603 ; AVX512NOTDQ: # %bb.0:
604 ; AVX512NOTDQ-NEXT: movb 2(%rdi), %al
605 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
606 ; AVX512NOTDQ-NEXT: retq
607 %d0 = load <3 x i1>, <3 x i1>* %a0
608 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
609 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 2 of a loaded <4 x i1> as <1 x i1> and store; folds to a byte
; load/store at offset 2.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
612 define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
613 ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
615 ; AVX512-NEXT: kmovb 2(%rdi), %k0
616 ; AVX512-NEXT: kmovb %k0, (%rsi)
619 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
620 ; AVX512NOTDQ: # %bb.0:
621 ; AVX512NOTDQ-NEXT: movb 2(%rdi), %al
622 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
623 ; AVX512NOTDQ-NEXT: retq
624 %d0 = load <4 x i1>, <4 x i1>* %a0
625 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
626 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 3 of a loaded <4 x i1> as <1 x i1> and store; folds to a byte
; load/store at offset 3.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
629 define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
630 ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
632 ; AVX512-NEXT: kmovb 3(%rdi), %k0
633 ; AVX512-NEXT: kmovb %k0, (%rsi)
636 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
637 ; AVX512NOTDQ: # %bb.0:
638 ; AVX512NOTDQ-NEXT: movb 3(%rdi), %al
639 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
640 ; AVX512NOTDQ-NEXT: retq
641 %d0 = load <4 x i1>, <4 x i1>* %a0
642 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
643 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 4 of a loaded <8 x i1> as <1 x i1> and store; folds to a byte
; load/store at offset 4.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
646 define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
647 ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
649 ; AVX512-NEXT: kmovb 4(%rdi), %k0
650 ; AVX512-NEXT: kmovb %k0, (%rsi)
653 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
654 ; AVX512NOTDQ: # %bb.0:
655 ; AVX512NOTDQ-NEXT: movb 4(%rdi), %al
656 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
657 ; AVX512NOTDQ-NEXT: retq
658 %d0 = load <8 x i1>, <8 x i1>* %a0
659 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
660 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast bit 4 of a loaded <8 x i1> mask to <2 x i1> and store the mask
; itself (kmovb with DQ; kmovd+movb of the low byte without DQ).
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
663 define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
664 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
666 ; AVX512-NEXT: kmovb 4(%rdi), %k0
667 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
668 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
669 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
670 ; AVX512-NEXT: kmovb %k0, (%rsi)
673 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
674 ; AVX512NOTDQ: # %bb.0:
675 ; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
676 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
677 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
678 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
679 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
680 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
681 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
682 ; AVX512NOTDQ-NEXT: retq
683 %d0 = load <8 x i1>, <8 x i1>* %a0
684 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
685 store <2 x i1> %d1, <2 x i1>* %a1
; Extract bit 7 of a loaded <8 x i1> as <1 x i1> and store; folds to a byte
; load/store at offset 7.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
688 define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
689 ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
691 ; AVX512-NEXT: kmovb 7(%rdi), %k0
692 ; AVX512-NEXT: kmovb %k0, (%rsi)
695 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
696 ; AVX512NOTDQ: # %bb.0:
697 ; AVX512NOTDQ-NEXT: movb 7(%rdi), %al
698 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
699 ; AVX512NOTDQ-NEXT: retq
700 %d0 = load <8 x i1>, <8 x i1>* %a0
701 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
702 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast bit 7 of a loaded <8 x i1> mask to <2 x i1> (vpshufd lane dup) and
; store the mask.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
705 define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
706 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
708 ; AVX512-NEXT: kmovb 6(%rdi), %k0
709 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
710 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
711 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
712 ; AVX512-NEXT: kmovb %k0, (%rsi)
715 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
716 ; AVX512NOTDQ: # %bb.0:
717 ; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1
718 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
719 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
720 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
721 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
722 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
723 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
724 ; AVX512NOTDQ-NEXT: retq
725 %d0 = load <8 x i1>, <8 x i1>* %a0
726 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
727 store <2 x i1> %d1, <2 x i1>* %a1
; Extract bit 8 of a loaded <16 x i1> as <1 x i1> and store; folds to a byte
; load/store at offset 8.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
730 define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
731 ; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
733 ; AVX512-NEXT: kmovb 8(%rdi), %k0
734 ; AVX512-NEXT: kmovb %k0, (%rsi)
737 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
738 ; AVX512NOTDQ: # %bb.0:
739 ; AVX512NOTDQ-NEXT: movb 8(%rdi), %al
740 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
741 ; AVX512NOTDQ-NEXT: retq
742 %d0 = load <16 x i1>, <16 x i1>* %a0
743 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
744 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast bit 8 of a loaded <16 x i1> mask to <2 x i1> and store the mask.
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
747 define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
748 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
750 ; AVX512-NEXT: kmovb 8(%rdi), %k0
751 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
752 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
753 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
754 ; AVX512-NEXT: kmovb %k0, (%rsi)
757 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
758 ; AVX512NOTDQ: # %bb.0:
759 ; AVX512NOTDQ-NEXT: kmovw 8(%rdi), %k1
760 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
761 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
762 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
763 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
764 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
765 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
766 ; AVX512NOTDQ-NEXT: retq
767 %d0 = load <16 x i1>, <16 x i1>* %a0
768 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
769 store <2 x i1> %d1, <2 x i1>* %a1
; Broadcast bit 8 of a loaded <16 x i1> mask to <4 x i1> and store the mask
; (dword-granularity: vpmovm2d / vmovdqa32 / vptestmd).
; NOTE(review): embedded numbering gaps show dropped lines ('ret void', '}') -- confirm vs. original test.
772 define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
773 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
775 ; AVX512-NEXT: kmovb 8(%rdi), %k0
776 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
777 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
778 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
779 ; AVX512-NEXT: kmovb %k0, (%rsi)
782 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
783 ; AVX512NOTDQ: # %bb.0:
784 ; AVX512NOTDQ-NEXT: kmovw 8(%rdi), %k1
785 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
786 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
787 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
788 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
789 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
790 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
791 ; AVX512NOTDQ-NEXT: retq
792 %d0 = load <16 x i1>, <16 x i1>* %a0
793 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
794 store <4 x i1> %d1, <4 x i1>* %a1
; Extract element 15 of a loaded <16 x i1> mask as <1 x i1> and store it;
; folds to a plain scalar byte move on the non-DQ path.
; NOTE(review): 15(%rdi) reads byte 15 while a packed <16 x i1> occupies only
; 2 bytes -- the offset looks like a bit index used as a byte offset. Confirm
; by regenerating these autogenerated CHECK lines with update_llc_test_checks.py.
797 define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
798 ; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
800 ; AVX512-NEXT: kmovb 15(%rdi), %k0
801 ; AVX512-NEXT: kmovb %k0, (%rsi)
804 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
805 ; AVX512NOTDQ: # %bb.0:
806 ; AVX512NOTDQ-NEXT: movb 15(%rdi), %al
807 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
808 ; AVX512NOTDQ-NEXT: retq
809 %d0 = load <16 x i1>, <16 x i1>* %a0
810 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
811 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast element 15 (the last lane) of a loaded <16 x i1> into <2 x i1>:
; the high q-lane is replicated with vpshufd [2,3,2,3] instead of a broadcast
; from lane 0. Load offset is shifted (14) so the wanted bit lands in the
; extracted lane -- verify via check regeneration, do not hand-edit.
814 define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
815 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
817 ; AVX512-NEXT: kmovb 14(%rdi), %k0
818 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
819 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
820 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
821 ; AVX512-NEXT: kmovb %k0, (%rsi)
824 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
825 ; AVX512NOTDQ: # %bb.0:
826 ; AVX512NOTDQ-NEXT: kmovw 14(%rdi), %k1
827 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
828 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
829 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
830 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
831 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
832 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
833 ; AVX512NOTDQ-NEXT: retq
834 %d0 = load <16 x i1>, <16 x i1>* %a0
835 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
836 store <2 x i1> %d1, <2 x i1>* %a1
; Broadcast element 15 of a loaded <16 x i1> into <4 x i1>: the top d-lane of
; the shifted load (offset 12) is replicated via vpshufd [3,3,3,3].
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
839 define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
840 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
842 ; AVX512-NEXT: kmovb 12(%rdi), %k0
843 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
844 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
845 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
846 ; AVX512-NEXT: kmovb %k0, (%rsi)
849 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
850 ; AVX512NOTDQ: # %bb.0:
851 ; AVX512NOTDQ-NEXT: kmovw 12(%rdi), %k1
852 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
853 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
854 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
855 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
856 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
857 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
858 ; AVX512NOTDQ-NEXT: retq
859 %d0 = load <16 x i1>, <16 x i1>* %a0
860 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
861 store <4 x i1> %d1, <4 x i1>* %a1
; Extract element 16 of a loaded <32 x i1> mask as <1 x i1> and store it;
; non-DQ path folds to a scalar byte load/store.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
864 define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
865 ; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
867 ; AVX512-NEXT: kmovb 16(%rdi), %k0
868 ; AVX512-NEXT: kmovb %k0, (%rsi)
871 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
872 ; AVX512NOTDQ: # %bb.0:
873 ; AVX512NOTDQ-NEXT: movb 16(%rdi), %al
874 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
875 ; AVX512NOTDQ-NEXT: retq
876 %d0 = load <32 x i1>, <32 x i1>* %a0
877 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
878 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast element 16 of a loaded <32 x i1> mask into <2 x i1> and store it;
; element 16 sits in lane 0 of the offset load, so a plain vpbroadcastq works.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
881 define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
882 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
884 ; AVX512-NEXT: kmovb 16(%rdi), %k0
885 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
886 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
887 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
888 ; AVX512-NEXT: kmovb %k0, (%rsi)
891 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
892 ; AVX512NOTDQ: # %bb.0:
893 ; AVX512NOTDQ-NEXT: kmovw 16(%rdi), %k1
894 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
895 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
896 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
897 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
898 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
899 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
900 ; AVX512NOTDQ-NEXT: retq
901 %d0 = load <32 x i1>, <32 x i1>* %a0
902 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
903 store <2 x i1> %d1, <2 x i1>* %a1
; Broadcast element 16 of a loaded <32 x i1> mask into <4 x i1> and store it
; (d-lane variant: vpmovm2d/vpbroadcastd/vpmovd2m).
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
906 define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
907 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
909 ; AVX512-NEXT: kmovb 16(%rdi), %k0
910 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
911 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
912 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
913 ; AVX512-NEXT: kmovb %k0, (%rsi)
916 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
917 ; AVX512NOTDQ: # %bb.0:
918 ; AVX512NOTDQ-NEXT: kmovw 16(%rdi), %k1
919 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
920 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
921 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
922 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
923 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
924 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
925 ; AVX512NOTDQ-NEXT: retq
926 %d0 = load <32 x i1>, <32 x i1>* %a0
927 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
928 store <4 x i1> %d1, <4 x i1>* %a1
; Broadcast element 16 of a loaded <32 x i1> mask into <8 x i1> and store it.
; Uses ymm-width d-lanes, hence the trailing vzeroupper before returning to
; potentially-SSE callers. Autogenerated CHECK lines -- regenerate, don't edit.
931 define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
932 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
934 ; AVX512-NEXT: kmovb 16(%rdi), %k0
935 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
936 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
937 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
938 ; AVX512-NEXT: kmovb %k0, (%rsi)
939 ; AVX512-NEXT: vzeroupper
942 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
943 ; AVX512NOTDQ: # %bb.0:
944 ; AVX512NOTDQ-NEXT: kmovw 16(%rdi), %k1
945 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
946 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
947 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
948 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
949 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
950 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
951 ; AVX512NOTDQ-NEXT: vzeroupper
952 ; AVX512NOTDQ-NEXT: retq
953 %d0 = load <32 x i1>, <32 x i1>* %a0
954 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
955 store <8 x i1> %d1, <8 x i1>* %a1
; Extract element 31 of a loaded <32 x i1> mask as <1 x i1> and store it.
; NOTE(review): 31(%rdi) reads byte 31 while a packed <32 x i1> occupies only
; 4 bytes -- offset looks like a bit index used as a byte offset. Confirm by
; regenerating these autogenerated CHECK lines with update_llc_test_checks.py.
958 define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
959 ; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
961 ; AVX512-NEXT: kmovb 31(%rdi), %k0
962 ; AVX512-NEXT: kmovb %k0, (%rsi)
965 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
966 ; AVX512NOTDQ: # %bb.0:
967 ; AVX512NOTDQ-NEXT: movb 31(%rdi), %al
968 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
969 ; AVX512NOTDQ-NEXT: retq
970 %d0 = load <32 x i1>, <32 x i1>* %a0
971 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
972 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast element 31 (last lane) of a loaded <32 x i1> into <2 x i1>:
; shifted load (offset 30), then the high q-lane replicated via vpshufd
; [2,3,2,3]. Autogenerated CHECK lines -- regenerate rather than hand-edit.
975 define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
976 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
978 ; AVX512-NEXT: kmovb 30(%rdi), %k0
979 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
980 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
981 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
982 ; AVX512-NEXT: kmovb %k0, (%rsi)
985 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
986 ; AVX512NOTDQ: # %bb.0:
987 ; AVX512NOTDQ-NEXT: kmovw 30(%rdi), %k1
988 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
989 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
990 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
991 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
992 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
993 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
994 ; AVX512NOTDQ-NEXT: retq
995 %d0 = load <32 x i1>, <32 x i1>* %a0
996 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
997 store <2 x i1> %d1, <2 x i1>* %a1
; Broadcast element 31 of a loaded <32 x i1> into <4 x i1>: shifted load
; (offset 28) plus vpshufd [3,3,3,3] to replicate the top d-lane.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1000 define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
1001 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
1003 ; AVX512-NEXT: kmovb 28(%rdi), %k0
1004 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1005 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1006 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1007 ; AVX512-NEXT: kmovb %k0, (%rsi)
1010 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
1011 ; AVX512NOTDQ: # %bb.0:
1012 ; AVX512NOTDQ-NEXT: kmovw 28(%rdi), %k1
1013 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1014 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1015 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1016 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1017 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1018 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1019 ; AVX512NOTDQ-NEXT: retq
1020 %d0 = load <32 x i1>, <32 x i1>* %a0
1021 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
1022 store <4 x i1> %d1, <4 x i1>* %a1
; Broadcast element 31 of a loaded <32 x i1> into <8 x i1>: a cross-lane ymm
; replication, so vpermd with an all-sevens index vector is used instead of
; vpshufd (which cannot cross 128-bit lanes). vzeroupper before return since
; ymm state was dirtied. Autogenerated CHECK lines -- regenerate, don't edit.
1025 define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
1026 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
1028 ; AVX512-NEXT: kmovb 24(%rdi), %k0
1029 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1030 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1031 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
1032 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1033 ; AVX512-NEXT: kmovb %k0, (%rsi)
1034 ; AVX512-NEXT: vzeroupper
1037 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
1038 ; AVX512NOTDQ: # %bb.0:
1039 ; AVX512NOTDQ-NEXT: kmovw 24(%rdi), %k1
1040 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1041 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1042 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1043 ; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
1044 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1045 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1046 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1047 ; AVX512NOTDQ-NEXT: vzeroupper
1048 ; AVX512NOTDQ-NEXT: retq
1049 %d0 = load <32 x i1>, <32 x i1>* %a0
1050 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
1051 store <8 x i1> %d1, <8 x i1>* %a1
; Extract element 32 of a loaded <64 x i1> mask as <1 x i1> and store it;
; folds to a scalar byte move on the non-DQ path.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1054 define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
1055 ; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
1057 ; AVX512-NEXT: kmovb 32(%rdi), %k0
1058 ; AVX512-NEXT: kmovb %k0, (%rsi)
1061 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
1062 ; AVX512NOTDQ: # %bb.0:
1063 ; AVX512NOTDQ-NEXT: movb 32(%rdi), %al
1064 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1065 ; AVX512NOTDQ-NEXT: retq
1066 %d0 = load <64 x i1>, <64 x i1>* %a0
1067 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
1068 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast element 32 of a loaded <64 x i1> mask into <2 x i1> and store it;
; the wanted bit lands in lane 0, so a plain vpbroadcastq suffices.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1071 define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
1072 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
1074 ; AVX512-NEXT: kmovb 32(%rdi), %k0
1075 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1076 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
1077 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1078 ; AVX512-NEXT: kmovb %k0, (%rsi)
1081 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
1082 ; AVX512NOTDQ: # %bb.0:
1083 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
1084 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1085 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1086 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
1087 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1088 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1089 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1090 ; AVX512NOTDQ-NEXT: retq
1091 %d0 = load <64 x i1>, <64 x i1>* %a0
1092 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
1093 store <2 x i1> %d1, <2 x i1>* %a1
; Broadcast element 32 of a loaded <64 x i1> mask into <4 x i1> and store it
; (d-lane variant: vpmovm2d/vpbroadcastd/vpmovd2m).
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1096 define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
1097 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
1099 ; AVX512-NEXT: kmovb 32(%rdi), %k0
1100 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1101 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
1102 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1103 ; AVX512-NEXT: kmovb %k0, (%rsi)
1106 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
1107 ; AVX512NOTDQ: # %bb.0:
1108 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
1109 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1110 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1111 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
1112 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1113 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1114 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1115 ; AVX512NOTDQ-NEXT: retq
1116 %d0 = load <64 x i1>, <64 x i1>* %a0
1117 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
1118 store <4 x i1> %d1, <4 x i1>* %a1
; Broadcast element 32 of a loaded <64 x i1> mask into <8 x i1> and store it.
; Uses ymm-width d-lanes, hence the trailing vzeroupper.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1121 define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
1122 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
1124 ; AVX512-NEXT: kmovb 32(%rdi), %k0
1125 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1126 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
1127 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1128 ; AVX512-NEXT: kmovb %k0, (%rsi)
1129 ; AVX512-NEXT: vzeroupper
1132 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
1133 ; AVX512NOTDQ: # %bb.0:
1134 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
1135 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1136 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1137 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
1138 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1139 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1140 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1141 ; AVX512NOTDQ-NEXT: vzeroupper
1142 ; AVX512NOTDQ-NEXT: retq
1143 %d0 = load <64 x i1>, <64 x i1>* %a0
1144 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
1145 store <8 x i1> %d1, <8 x i1>* %a1
; Broadcast element 32 of a loaded <64 x i1> mask into <16 x i1> and store it
; at zmm width. Without DQ the mask->vector move is emulated with a
; zero-masked vpternlogd $255 (all-ones) plus vptestmd; with DQ vpmovm2d is
; used directly. Autogenerated CHECK lines -- regenerate, don't hand-edit.
1148 define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
1149 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
1151 ; AVX512-NEXT: kmovw 32(%rdi), %k0
1152 ; AVX512-NEXT: vpmovm2d %k0, %zmm0
1153 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
1154 ; AVX512-NEXT: vpmovd2m %zmm0, %k0
1155 ; AVX512-NEXT: kmovw %k0, (%rsi)
1156 ; AVX512-NEXT: vzeroupper
1159 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
1160 ; AVX512NOTDQ: # %bb.0:
1161 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
1162 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1163 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
1164 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
1165 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
1166 ; AVX512NOTDQ-NEXT: vzeroupper
1167 ; AVX512NOTDQ-NEXT: retq
1168 %d0 = load <64 x i1>, <64 x i1>* %a0
1169 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
1170 store <16 x i1> %d1, <16 x i1>* %a1
; Extract element 63 of a loaded <64 x i1> mask as <1 x i1> and store it.
; NOTE(review): 63(%rdi) reads byte 63 while a packed <64 x i1> occupies only
; 8 bytes -- offset looks like a bit index used as a byte offset. Confirm by
; regenerating these autogenerated CHECK lines with update_llc_test_checks.py.
1173 define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
1174 ; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
1176 ; AVX512-NEXT: kmovb 63(%rdi), %k0
1177 ; AVX512-NEXT: kmovb %k0, (%rsi)
1180 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
1181 ; AVX512NOTDQ: # %bb.0:
1182 ; AVX512NOTDQ-NEXT: movb 63(%rdi), %al
1183 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1184 ; AVX512NOTDQ-NEXT: retq
1185 %d0 = load <64 x i1>, <64 x i1>* %a0
1186 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
1187 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast element 63 (last lane) of a loaded <64 x i1> into <2 x i1>:
; shifted load (offset 62), then vpshufd [2,3,2,3] replicates the high q-lane.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1190 define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
1191 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
1193 ; AVX512-NEXT: kmovb 62(%rdi), %k0
1194 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1195 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1196 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1197 ; AVX512-NEXT: kmovb %k0, (%rsi)
1200 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
1201 ; AVX512NOTDQ: # %bb.0:
1202 ; AVX512NOTDQ-NEXT: kmovw 62(%rdi), %k1
1203 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1204 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1205 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1206 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1207 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1208 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1209 ; AVX512NOTDQ-NEXT: retq
1210 %d0 = load <64 x i1>, <64 x i1>* %a0
1211 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
1212 store <2 x i1> %d1, <2 x i1>* %a1
; Broadcast element 63 of a loaded <64 x i1> into <4 x i1>: shifted load
; (offset 60) plus vpshufd [3,3,3,3] to replicate the top d-lane.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1215 define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
1216 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
1218 ; AVX512-NEXT: kmovb 60(%rdi), %k0
1219 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1220 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1221 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1222 ; AVX512-NEXT: kmovb %k0, (%rsi)
1225 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
1226 ; AVX512NOTDQ: # %bb.0:
1227 ; AVX512NOTDQ-NEXT: kmovw 60(%rdi), %k1
1228 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1229 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1230 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1231 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1232 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1233 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1234 ; AVX512NOTDQ-NEXT: retq
1235 %d0 = load <64 x i1>, <64 x i1>* %a0
1236 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
1237 store <4 x i1> %d1, <4 x i1>* %a1
; Broadcast element 63 of a loaded <64 x i1> into <8 x i1>: cross-lane ymm
; replication via vpermd with an all-sevens index vector (vpshufd cannot cross
; 128-bit lanes); vzeroupper before return since ymm state was dirtied.
; Autogenerated CHECK lines -- regenerate rather than hand-edit.
1240 define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
1241 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
1243 ; AVX512-NEXT: kmovb 56(%rdi), %k0
1244 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1245 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1246 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
1247 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1248 ; AVX512-NEXT: kmovb %k0, (%rsi)
1249 ; AVX512-NEXT: vzeroupper
1252 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
1253 ; AVX512NOTDQ: # %bb.0:
1254 ; AVX512NOTDQ-NEXT: kmovw 56(%rdi), %k1
1255 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1256 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1257 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1258 ; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
1259 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1260 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1261 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1262 ; AVX512NOTDQ-NEXT: vzeroupper
1263 ; AVX512NOTDQ-NEXT: retq
1264 %d0 = load <64 x i1>, <64 x i1>* %a0
1265 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
1266 store <8 x i1> %d1, <8 x i1>* %a1
; Broadcast element 63 of a loaded <64 x i1> into <16 x i1> at zmm width:
; vpermd with an all-15s index replicates the top d-lane across the zmm.
; Non-DQ path builds the vector mask with zero-masked vpternlogd $255 and
; reads it back with vptestmd. Autogenerated CHECK lines -- regenerate,
; don't hand-edit.
1269 define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
1270 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
1272 ; AVX512-NEXT: kmovw 48(%rdi), %k0
1273 ; AVX512-NEXT: vpmovm2d %k0, %zmm0
1274 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1275 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
1276 ; AVX512-NEXT: vpmovd2m %zmm0, %k0
1277 ; AVX512-NEXT: kmovw %k0, (%rsi)
1278 ; AVX512-NEXT: vzeroupper
1281 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
1282 ; AVX512NOTDQ: # %bb.0:
1283 ; AVX512NOTDQ-NEXT: kmovw 48(%rdi), %k1
1284 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1285 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1286 ; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
1287 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
1288 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
1289 ; AVX512NOTDQ-NEXT: vzeroupper
1290 ; AVX512NOTDQ-NEXT: retq
1291 %d0 = load <64 x i1>, <64 x i1>* %a0
1292 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
1293 store <16 x i1> %d1, <16 x i1>* %a1