1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
; Broadcast bit 4 of a loaded <8 x i1> mask to <2 x i1> and use it as a
; vselect mask between the two <2 x double> args (DQ targets use kmovb +
; vpmovm2q/vpmovq2m; non-DQ targets emulate via masked all-ones + vptestmq).
5 define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
6 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
8 ; AVX512-NEXT: kmovb 4(%rdi), %k0
9 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
10 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
11 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
12 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
13 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
16 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
17 ; AVX512NOTDQ: # %bb.0:
18 ; AVX512NOTDQ-NEXT: movzbl 4(%rdi), %eax
19 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
20 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
21 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
22 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
23 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
24 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
25 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
26 ; AVX512NOTDQ-NEXT: retq
27 %d0 = load <8 x i1>, <8 x i1>* %a0
28 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
29 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
30 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 7 (last element) of a loaded <8 x i1> mask to <2 x i1>; the
; odd index is reached via vpshufd on the sign-extended mask vector rather
; than a plain broadcast.
33 define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
34 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
36 ; AVX512-NEXT: kmovb 6(%rdi), %k0
37 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
38 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
39 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
40 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
41 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
44 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
45 ; AVX512NOTDQ: # %bb.0:
46 ; AVX512NOTDQ-NEXT: movzbl 6(%rdi), %eax
47 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
48 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
49 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
50 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
51 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
52 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
53 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
54 ; AVX512NOTDQ-NEXT: retq
55 %d0 = load <8 x i1>, <8 x i1>* %a0
56 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
57 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
58 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 8 of a loaded <16 x i1> mask to <2 x i1>; bit 8 is byte-aligned
; so the mask load narrows to a single kmovb/movzbl at offset 8.
61 define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
62 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
64 ; AVX512-NEXT: kmovb 8(%rdi), %k0
65 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
66 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
67 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
68 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
69 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
72 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
73 ; AVX512NOTDQ: # %bb.0:
74 ; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
75 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
76 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
77 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
78 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
79 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
80 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
81 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
82 ; AVX512NOTDQ-NEXT: retq
83 %d0 = load <16 x i1>, <16 x i1>* %a0
84 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
85 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
86 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 8 of a loaded <16 x i1> mask to <4 x i1>; the 4-wide result
; uses dword-element mask expansion (vpmovm2d/vpbroadcastd) instead of qword.
89 define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
90 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
92 ; AVX512-NEXT: kmovb 8(%rdi), %k0
93 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
94 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
95 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
96 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
97 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
100 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
101 ; AVX512NOTDQ: # %bb.0:
102 ; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
103 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
104 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
105 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
106 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
107 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
108 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
109 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
110 ; AVX512NOTDQ-NEXT: retq
111 %d0 = load <16 x i1>, <16 x i1>* %a0
112 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
113 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
114 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 15 (last element) of a loaded <16 x i1> mask to <2 x i1>;
; the load narrows to offset 14 and vpshufd selects the high qword lane.
117 define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
118 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
120 ; AVX512-NEXT: kmovb 14(%rdi), %k0
121 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
122 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
123 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
124 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
125 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
128 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
129 ; AVX512NOTDQ: # %bb.0:
130 ; AVX512NOTDQ-NEXT: movzbl 14(%rdi), %eax
131 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
132 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
133 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
134 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
135 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
136 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
137 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
138 ; AVX512NOTDQ-NEXT: retq
139 %d0 = load <16 x i1>, <16 x i1>* %a0
140 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
141 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
142 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 15 of a loaded <16 x i1> mask to <4 x i1>; the load narrows
; to offset 12 and vpshufd splats the top dword lane (index 3).
145 define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
146 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
148 ; AVX512-NEXT: kmovb 12(%rdi), %k0
149 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
150 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
151 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
152 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
153 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
156 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
157 ; AVX512NOTDQ: # %bb.0:
158 ; AVX512NOTDQ-NEXT: movzbl 12(%rdi), %eax
159 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
160 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
161 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
162 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
163 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
164 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
165 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
166 ; AVX512NOTDQ-NEXT: retq
167 %d0 = load <16 x i1>, <16 x i1>* %a0
168 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
169 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
170 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 16 of a loaded <32 x i1> mask to <2 x i1>; byte-aligned index,
; so only the byte at offset 16 is loaded.
173 define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
174 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
176 ; AVX512-NEXT: kmovb 16(%rdi), %k0
177 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
178 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
179 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
180 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
181 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
184 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
185 ; AVX512NOTDQ: # %bb.0:
186 ; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
187 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
188 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
189 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
190 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
191 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
192 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
193 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
194 ; AVX512NOTDQ-NEXT: retq
195 %d0 = load <32 x i1>, <32 x i1>* %a0
196 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
197 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
198 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 16 of a loaded <32 x i1> mask to <4 x i1> (dword-element
; expansion, vselect between the two <4 x float> args).
201 define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
202 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
204 ; AVX512-NEXT: kmovb 16(%rdi), %k0
205 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
206 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
207 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
208 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
209 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
212 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
213 ; AVX512NOTDQ: # %bb.0:
214 ; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
215 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
216 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
217 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
218 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
219 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
220 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
221 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
222 ; AVX512NOTDQ-NEXT: retq
223 %d0 = load <32 x i1>, <32 x i1>* %a0
224 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
225 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
226 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 16 of a loaded <32 x i1> mask to <8 x i1>; the 8-wide mask
; lives in a ymm, so vzeroupper is emitted before returning.
229 define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
230 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
232 ; AVX512-NEXT: kmovb 16(%rdi), %k0
233 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
234 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
235 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
236 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
237 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
238 ; AVX512-NEXT: vzeroupper
241 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
242 ; AVX512NOTDQ: # %bb.0:
243 ; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
244 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
245 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
246 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
247 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
248 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
249 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
250 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
251 ; AVX512NOTDQ-NEXT: vzeroupper
252 ; AVX512NOTDQ-NEXT: retq
253 %d0 = load <32 x i1>, <32 x i1>* %a0
254 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
255 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
256 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 31 (last element) of a loaded <32 x i1> mask to <2 x i1>;
; narrowed load at offset 30 plus a high-lane vpshufd.
259 define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
260 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
262 ; AVX512-NEXT: kmovb 30(%rdi), %k0
263 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
264 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
265 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
266 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
267 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
270 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
271 ; AVX512NOTDQ: # %bb.0:
272 ; AVX512NOTDQ-NEXT: movzbl 30(%rdi), %eax
273 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
274 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
275 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
276 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
277 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
278 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
279 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
280 ; AVX512NOTDQ-NEXT: retq
281 %d0 = load <32 x i1>, <32 x i1>* %a0
282 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
283 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
284 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 31 of a loaded <32 x i1> mask to <4 x i1>; narrowed load at
; offset 28, then splat of dword lane 3.
287 define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
288 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
290 ; AVX512-NEXT: kmovb 28(%rdi), %k0
291 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
292 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
293 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
294 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
295 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
298 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
299 ; AVX512NOTDQ: # %bb.0:
300 ; AVX512NOTDQ-NEXT: movzbl 28(%rdi), %eax
301 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
302 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
303 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
304 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
305 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
306 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
307 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
308 ; AVX512NOTDQ-NEXT: retq
309 %d0 = load <32 x i1>, <32 x i1>* %a0
310 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
311 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
312 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 31 of a loaded <32 x i1> mask to <8 x i1>; splatting lane 7
; of a ymm needs a vpermd with an all-7s index vector (vpshufd can't cross
; 128-bit lanes).
315 define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
316 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
318 ; AVX512-NEXT: kmovb 24(%rdi), %k0
319 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
320 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
321 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
322 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
323 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
324 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
325 ; AVX512-NEXT: vzeroupper
328 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
329 ; AVX512NOTDQ: # %bb.0:
330 ; AVX512NOTDQ-NEXT: movzbl 24(%rdi), %eax
331 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
332 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
333 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
334 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
335 ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
336 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
337 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
338 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
339 ; AVX512NOTDQ-NEXT: vzeroupper
340 ; AVX512NOTDQ-NEXT: retq
341 %d0 = load <32 x i1>, <32 x i1>* %a0
342 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
343 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
344 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <2 x i1>; byte-aligned index,
; so only the byte at offset 32 is loaded.
347 define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
348 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
350 ; AVX512-NEXT: kmovb 32(%rdi), %k0
351 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
352 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
353 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
354 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
355 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
358 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
359 ; AVX512NOTDQ: # %bb.0:
360 ; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
361 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
362 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
363 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
364 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
365 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
366 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
367 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
368 ; AVX512NOTDQ-NEXT: retq
369 %d0 = load <64 x i1>, <64 x i1>* %a0
370 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
371 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
372 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <4 x i1> (dword-element
; expansion, vselect between the two <4 x float> args).
375 define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
376 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
378 ; AVX512-NEXT: kmovb 32(%rdi), %k0
379 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
380 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
381 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
382 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
383 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
386 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
387 ; AVX512NOTDQ: # %bb.0:
388 ; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
389 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
390 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
391 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
392 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
393 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
394 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
395 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
396 ; AVX512NOTDQ-NEXT: retq
397 %d0 = load <64 x i1>, <64 x i1>* %a0
398 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
399 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
400 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <8 x i1>; ymm-wide mask with
; vzeroupper before returning.
403 define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
404 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
406 ; AVX512-NEXT: kmovb 32(%rdi), %k0
407 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
408 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
409 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
410 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
411 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
412 ; AVX512-NEXT: vzeroupper
415 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
416 ; AVX512NOTDQ: # %bb.0:
417 ; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
418 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
419 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
420 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
421 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
422 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
423 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
424 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
425 ; AVX512NOTDQ-NEXT: vzeroupper
426 ; AVX512NOTDQ-NEXT: retq
427 %d0 = load <64 x i1>, <64 x i1>* %a0
428 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
429 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
430 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 32 of a loaded <64 x i1> mask to <16 x i1>; 16-wide mask uses
; kmovw + zmm registers (non-DQ builds an all-ones zmm with vpternlogd $255).
433 define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
434 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
436 ; AVX512-NEXT: kmovw 32(%rdi), %k0
437 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
438 ; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
439 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
440 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
441 ; AVX512-NEXT: vmovaps %zmm1, (%rsi)
442 ; AVX512-NEXT: vzeroupper
445 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
446 ; AVX512NOTDQ: # %bb.0:
447 ; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
448 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
449 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
450 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
451 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
452 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
453 ; AVX512NOTDQ-NEXT: vzeroupper
454 ; AVX512NOTDQ-NEXT: retq
455 %d0 = load <64 x i1>, <64 x i1>* %a0
456 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
457 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
458 store <16 x float> %d2, <16 x float>* %a3
; Broadcast bit 63 (last element) of a loaded <64 x i1> mask to <2 x i1>;
; narrowed load at offset 62 plus a high-lane vpshufd.
461 define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
462 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
464 ; AVX512-NEXT: kmovb 62(%rdi), %k0
465 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
466 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
467 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
468 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
469 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
472 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
473 ; AVX512NOTDQ: # %bb.0:
474 ; AVX512NOTDQ-NEXT: movzbl 62(%rdi), %eax
475 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
476 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
477 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
478 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
479 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
480 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
481 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
482 ; AVX512NOTDQ-NEXT: retq
483 %d0 = load <64 x i1>, <64 x i1>* %a0
484 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
485 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
486 store <2 x double> %d2, <2 x double>* %a3
; Broadcast bit 63 of a loaded <64 x i1> mask to <4 x i1>; narrowed load at
; offset 60, then splat of dword lane 3.
489 define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
490 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
492 ; AVX512-NEXT: kmovb 60(%rdi), %k0
493 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
494 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
495 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
496 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
497 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
500 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
501 ; AVX512NOTDQ: # %bb.0:
502 ; AVX512NOTDQ-NEXT: movzbl 60(%rdi), %eax
503 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
504 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
505 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
506 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
507 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
508 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
509 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
510 ; AVX512NOTDQ-NEXT: retq
511 %d0 = load <64 x i1>, <64 x i1>* %a0
512 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
513 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
514 store <4 x float> %d2, <4 x float>* %a3
; Broadcast bit 63 of a loaded <64 x i1> mask to <8 x i1>; lane 7 of a ymm is
; splatted via vpermd with an all-7s index (cross-lane shuffle required).
517 define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
518 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
520 ; AVX512-NEXT: kmovb 56(%rdi), %k0
521 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
522 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
523 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
524 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
525 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
526 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
527 ; AVX512-NEXT: vzeroupper
530 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
531 ; AVX512NOTDQ: # %bb.0:
532 ; AVX512NOTDQ-NEXT: movzbl 56(%rdi), %eax
533 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
534 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
535 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
536 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
537 ; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
538 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
539 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
540 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
541 ; AVX512NOTDQ-NEXT: vzeroupper
542 ; AVX512NOTDQ-NEXT: retq
543 %d0 = load <64 x i1>, <64 x i1>* %a0
544 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
545 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
546 store <8 x float> %d2, <8 x float>* %a3
; Broadcast bit 63 of a loaded <64 x i1> mask to <16 x i1>; kmovw at offset 48
; then vpermd with an all-15s zmm index splats the top dword element.
549 define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
550 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
552 ; AVX512-NEXT: kmovw 48(%rdi), %k0
553 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
554 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
555 ; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
556 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
557 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
558 ; AVX512-NEXT: vmovaps %zmm1, (%rsi)
559 ; AVX512-NEXT: vzeroupper
562 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
563 ; AVX512NOTDQ: # %bb.0:
564 ; AVX512NOTDQ-NEXT: kmovw 48(%rdi), %k1
565 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
566 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
567 ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
568 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
569 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
570 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
571 ; AVX512NOTDQ-NEXT: vzeroupper
572 ; AVX512NOTDQ-NEXT: retq
573 %d0 = load <64 x i1>, <64 x i1>* %a0
574 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
575 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
576 store <16 x float> %d2, <16 x float>* %a3
; Extract bit 1 of a loaded <2 x i1> as a <1 x i1> and store it; reduces to a
; single byte load+store (kmovb pair with DQ, movb pair without).
579 define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
580 ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
582 ; AVX512-NEXT: kmovb 1(%rdi), %k0
583 ; AVX512-NEXT: kmovb %k0, (%rsi)
586 ; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
587 ; AVX512NOTDQ: # %bb.0:
588 ; AVX512NOTDQ-NEXT: movb 1(%rdi), %al
589 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
590 ; AVX512NOTDQ-NEXT: retq
591 %d0 = load <2 x i1>, <2 x i1>* %a0
592 %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
593 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 1 of a loaded <3 x i1> as a <1 x i1> and store it (byte
; load+store, same pattern as the v2i1 case).
596 define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
597 ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
599 ; AVX512-NEXT: kmovb 1(%rdi), %k0
600 ; AVX512-NEXT: kmovb %k0, (%rsi)
603 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
604 ; AVX512NOTDQ: # %bb.0:
605 ; AVX512NOTDQ-NEXT: movb 1(%rdi), %al
606 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
607 ; AVX512NOTDQ-NEXT: retq
608 %d0 = load <3 x i1>, <3 x i1>* %a0
609 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
610 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 2 (last element) of a loaded <3 x i1> as a <1 x i1> and store it.
613 define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
614 ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
616 ; AVX512-NEXT: kmovb 2(%rdi), %k0
617 ; AVX512-NEXT: kmovb %k0, (%rsi)
620 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
621 ; AVX512NOTDQ: # %bb.0:
622 ; AVX512NOTDQ-NEXT: movb 2(%rdi), %al
623 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
624 ; AVX512NOTDQ-NEXT: retq
625 %d0 = load <3 x i1>, <3 x i1>* %a0
626 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
627 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 2 of a loaded <4 x i1> as a <1 x i1> and store it.
630 define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
631 ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
633 ; AVX512-NEXT: kmovb 2(%rdi), %k0
634 ; AVX512-NEXT: kmovb %k0, (%rsi)
637 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
638 ; AVX512NOTDQ: # %bb.0:
639 ; AVX512NOTDQ-NEXT: movb 2(%rdi), %al
640 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
641 ; AVX512NOTDQ-NEXT: retq
642 %d0 = load <4 x i1>, <4 x i1>* %a0
643 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
644 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 3 (last element) of a loaded <4 x i1> as a <1 x i1> and store it.
647 define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
648 ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
650 ; AVX512-NEXT: kmovb 3(%rdi), %k0
651 ; AVX512-NEXT: kmovb %k0, (%rsi)
654 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
655 ; AVX512NOTDQ: # %bb.0:
656 ; AVX512NOTDQ-NEXT: movb 3(%rdi), %al
657 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
658 ; AVX512NOTDQ-NEXT: retq
659 %d0 = load <4 x i1>, <4 x i1>* %a0
660 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
661 store <1 x i1> %d1, <1 x i1>* %a1
; Extract bit 4 of a loaded <8 x i1> as a <1 x i1> and store it.
664 define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
665 ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
667 ; AVX512-NEXT: kmovb 4(%rdi), %k0
668 ; AVX512-NEXT: kmovb %k0, (%rsi)
671 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
672 ; AVX512NOTDQ: # %bb.0:
673 ; AVX512NOTDQ-NEXT: movb 4(%rdi), %al
674 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
675 ; AVX512NOTDQ-NEXT: retq
676 %d0 = load <8 x i1>, <8 x i1>* %a0
677 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
678 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast bit 4 of a loaded <8 x i1> mask to <2 x i1> and store the
; resulting mask (mask is written back to memory instead of feeding a select).
681 define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
682 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
684 ; AVX512-NEXT: kmovb 4(%rdi), %k0
685 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
686 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
687 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
688 ; AVX512-NEXT: kmovb %k0, (%rsi)
691 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
692 ; AVX512NOTDQ: # %bb.0:
693 ; AVX512NOTDQ-NEXT: movzbl 4(%rdi), %eax
694 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
695 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
696 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
697 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
698 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
699 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
700 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
701 ; AVX512NOTDQ-NEXT: retq
702 %d0 = load <8 x i1>, <8 x i1>* %a0
703 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
704 store <2 x i1> %d1, <2 x i1>* %a1
; Extract bit 7 (last element) of a loaded <8 x i1> as a <1 x i1> and store it.
707 define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
708 ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
710 ; AVX512-NEXT: kmovb 7(%rdi), %k0
711 ; AVX512-NEXT: kmovb %k0, (%rsi)
714 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
715 ; AVX512NOTDQ: # %bb.0:
716 ; AVX512NOTDQ-NEXT: movb 7(%rdi), %al
717 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
718 ; AVX512NOTDQ-NEXT: retq
719 %d0 = load <8 x i1>, <8 x i1>* %a0
720 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
721 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast bit 7 of a loaded <8 x i1> mask to <2 x i1> and store the
; resulting mask; the odd index uses a high-lane vpshufd.
724 define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
725 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
727 ; AVX512-NEXT: kmovb 6(%rdi), %k0
728 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
729 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
730 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
731 ; AVX512-NEXT: kmovb %k0, (%rsi)
734 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
735 ; AVX512NOTDQ: # %bb.0:
736 ; AVX512NOTDQ-NEXT: movzbl 6(%rdi), %eax
737 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
738 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
739 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
740 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
741 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
742 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
743 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
744 ; AVX512NOTDQ-NEXT: retq
745 %d0 = load <8 x i1>, <8 x i1>* %a0
746 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
747 store <2 x i1> %d1, <2 x i1>* %a1
; Extract bit 8 of a loaded <16 x i1> as a <1 x i1> and store it.
750 define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
751 ; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
753 ; AVX512-NEXT: kmovb 8(%rdi), %k0
754 ; AVX512-NEXT: kmovb %k0, (%rsi)
757 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
758 ; AVX512NOTDQ: # %bb.0:
759 ; AVX512NOTDQ-NEXT: movb 8(%rdi), %al
760 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
761 ; AVX512NOTDQ-NEXT: retq
762 %d0 = load <16 x i1>, <16 x i1>* %a0
763 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
764 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcast bit 8 of a loaded <16 x i1> mask to <2 x i1> and store the
; resulting mask.
767 define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
768 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
770 ; AVX512-NEXT: kmovb 8(%rdi), %k0
771 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
772 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
773 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
774 ; AVX512-NEXT: kmovb %k0, (%rsi)
777 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
778 ; AVX512NOTDQ: # %bb.0:
779 ; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
780 ; AVX512NOTDQ-NEXT: kmovd %eax, %k1
781 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
782 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
783 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
784 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
785 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
786 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
787 ; AVX512NOTDQ-NEXT: retq
788 %d0 = load <16 x i1>, <16 x i1>* %a0
789 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
790 store <2 x i1> %d1, <2 x i1>* %a1
; <16 x i1> element 8 -> <4 x i1> broadcast, stored as a mask byte. Same shape
; as the v2i1 case but with dword lanes (vpmovm2d/vpmovd2m, vptestmd).
define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512-NEXT: kmovb 8(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <16 x i1>, <16 x i1>* %a0
%d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
store <4 x i1> %d1, <4 x i1>* %a1
; <16 x i1> element 15 -> <1 x i1>, stored to memory; lowers to a direct byte
; load+store at offset 15 on both paths.
define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512-NEXT: kmovb 15(%rdi), %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movb 15(%rdi), %al
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <16 x i1>, <16 x i1>* %a0
%d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
store <1 x i1> %d1, <1 x i1>* %a1
; <16 x i1> element 15 (odd index) -> <2 x i1> broadcast-and-store. Loads the
; mask byte at offset 14 and selects the high qword lane with vpshufd [2,3,2,3]
; instead of vpbroadcastq.
define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512-NEXT: kmovb 14(%rdi), %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 14(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <16 x i1>, <16 x i1>* %a0
%d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
store <2 x i1> %d1, <2 x i1>* %a1
; <16 x i1> element 15 -> <4 x i1> broadcast-and-store. Loads the mask byte at
; offset 12 and splats the top dword lane with vpshufd [3,3,3,3].
define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512-NEXT: kmovb 12(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 12(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <16 x i1>, <16 x i1>* %a0
%d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
store <4 x i1> %d1, <4 x i1>* %a1
; <32 x i1> element 16 -> <1 x i1>, stored to memory; direct byte load+store
; at offset 16 on both paths.
define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512-NEXT: kmovb 16(%rdi), %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movb 16(%rdi), %al
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
store <1 x i1> %d1, <1 x i1>* %a1
; <32 x i1> element 16 -> <2 x i1> broadcast-and-store; qword-lane pattern
; (vpbroadcastq), DQ vs emulated non-DQ mask<->vector moves.
define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512-NEXT: kmovb 16(%rdi), %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
store <2 x i1> %d1, <2 x i1>* %a1
; <32 x i1> element 16 -> <4 x i1> broadcast-and-store; dword-lane pattern
; (vpbroadcastd), DQ vs emulated non-DQ mask<->vector moves.
define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512-NEXT: kmovb 16(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
store <4 x i1> %d1, <4 x i1>* %a1
; <32 x i1> element 16 -> <8 x i1> broadcast-and-store. Widens to ymm (8 dword
; lanes), so both paths end with vzeroupper before returning.
define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512-NEXT: kmovb 16(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
store <8 x i1> %d1, <8 x i1>* %a1
; <32 x i1> element 31 -> <1 x i1>, stored to memory; direct byte load+store
; at offset 31 on both paths.
define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512-NEXT: kmovb 31(%rdi), %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movb 31(%rdi), %al
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
store <1 x i1> %d1, <1 x i1>* %a1
; <32 x i1> element 31 (odd index) -> <2 x i1> broadcast-and-store. Loads the
; mask byte at offset 30 and selects the high qword lane via vpshufd [2,3,2,3].
define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512-NEXT: kmovb 30(%rdi), %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 30(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
store <2 x i1> %d1, <2 x i1>* %a1
; <32 x i1> element 31 -> <4 x i1> broadcast-and-store. Loads the mask byte at
; offset 28 and splats the top dword lane with vpshufd [3,3,3,3].
define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512-NEXT: kmovb 28(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 28(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
store <4 x i1> %d1, <4 x i1>* %a1
; <32 x i1> element 31 -> <8 x i1> broadcast-and-store. ymm-wide: a lane-7
; splat needs a cross-lane vpermd with a constant all-7s index vector; both
; paths end with vzeroupper.
define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512-NEXT: kmovb 24(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 24(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
%d0 = load <32 x i1>, <32 x i1>* %a0
%d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
store <8 x i1> %d1, <8 x i1>* %a1
; <64 x i1> element 32 -> <1 x i1>, stored to memory; direct byte load+store
; at offset 32 on both paths.
define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512-NEXT: kmovb 32(%rdi), %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movb 32(%rdi), %al
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
store <1 x i1> %d1, <1 x i1>* %a1
; <64 x i1> element 32 -> <2 x i1> broadcast-and-store; qword-lane pattern
; (vpbroadcastq), DQ vs emulated non-DQ mask<->vector moves.
define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512-NEXT: kmovb 32(%rdi), %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
store <2 x i1> %d1, <2 x i1>* %a1
; <64 x i1> element 32 -> <4 x i1> broadcast-and-store; dword-lane pattern
; (vpbroadcastd), DQ vs emulated non-DQ mask<->vector moves.
define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512-NEXT: kmovb 32(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
store <4 x i1> %d1, <4 x i1>* %a1
; <64 x i1> element 32 -> <8 x i1> broadcast-and-store. ymm-wide (8 dword
; lanes); both paths end with vzeroupper.
define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512-NEXT: kmovb 32(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
store <8 x i1> %d1, <8 x i1>* %a1
; <64 x i1> element 32 -> <16 x i1> broadcast-and-store. zmm-wide: kmovw for
; the 16-bit mask; without DQ the all-ones vector comes from a zero-masked
; vpternlogd $255 instead of vpcmpeqd+vmovdqa32.
define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512-NEXT: kmovw 32(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512-NEXT: vpmovd2m %zmm0, %k0
; AVX512-NEXT: kmovw %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
store <16 x i1> %d1, <16 x i1>* %a1
; <64 x i1> element 63 -> <1 x i1>, stored to memory; direct byte load+store
; at offset 63 on both paths.
define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512-NEXT: kmovb 63(%rdi), %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movb 63(%rdi), %al
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
store <1 x i1> %d1, <1 x i1>* %a1
; <64 x i1> element 63 (odd index) -> <2 x i1> broadcast-and-store. Loads the
; mask byte at offset 62 and selects the high qword lane via vpshufd [2,3,2,3].
define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512-NEXT: kmovb 62(%rdi), %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 62(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
store <2 x i1> %d1, <2 x i1>* %a1
; <64 x i1> element 63 -> <4 x i1> broadcast-and-store. Loads the mask byte at
; offset 60 and splats the top dword lane with vpshufd [3,3,3,3].
define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512-NEXT: kmovb 60(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 60(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
store <4 x i1> %d1, <4 x i1>* %a1
; <64 x i1> element 63 -> <8 x i1> broadcast-and-store. ymm-wide: the lane-7
; splat is a cross-lane vpermd with a constant all-7s index vector; both paths
; end with vzeroupper.
define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512-NEXT: kmovb 56(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 56(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
store <8 x i1> %d1, <8 x i1>* %a1
; <64 x i1> element 63 -> <16 x i1> broadcast-and-store. zmm-wide: kmovw loads
; the mask word at offset 48, the lane-15 splat is a cross-lane vpermd with a
; constant all-15s index vector; non-DQ builds the all-ones vector with a
; zero-masked vpternlogd $255.
define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512-NEXT: kmovw 48(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vpmovd2m %zmm0, %k0
; AVX512-NEXT: kmovw %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 48(%rdi), %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
%d0 = load <64 x i1>, <64 x i1>* %a0
%d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
store <16 x i1> %d1, <16 x i1>* %a1