1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ; RUN: llc -mtriple=x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX2
3 ; RUN: llc -mtriple=x86_64 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=CHECK,AVX512F
4 ; RUN: llc -mtriple=x86_64 -mattr=+avx512f,+avx512vl,+avx512vbmi2 < %s | FileCheck %s --check-prefixes=CHECK,AVX512VL
; test_compress_v4i32: exercises llvm.experimental.vector.compress on a
; <4 x i32> vector with a <4 x i1> mask and a <4 x i32> passthru operand.
;  * avx2 run: the assertions show passthru spilled to a stack slot, elements
;    of %vec written into that slot at computed indices, and the slot reloaded
;    into xmm0 as the result.
;  * avx512f run: the operands are used as zmm registers; the kshiftlw/kshiftrw
;    pair (both by 12) keeps only the low 4 bits of the 16-bit mask before the
;    zmm vpcompressd.
;  * avx512vl run: a single xmm vpcompressd under %k1.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing by hand.
6 define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) {
7 ; AVX2-LABEL: test_compress_v4i32:
9 ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
10 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
11 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
12 ; AVX2-NEXT: vpextrd $1, %xmm1, %eax
13 ; AVX2-NEXT: vmovd %xmm1, %esi
14 ; AVX2-NEXT: andl $1, %esi
15 ; AVX2-NEXT: movl %esi, %edi
16 ; AVX2-NEXT: subl %eax, %edi
17 ; AVX2-NEXT: vpextrd $2, %xmm1, %edx
18 ; AVX2-NEXT: subl %edx, %edi
19 ; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
20 ; AVX2-NEXT: subl %ecx, %edi
21 ; AVX2-NEXT: andl $3, %edi
22 ; AVX2-NEXT: andl $1, %eax
23 ; AVX2-NEXT: addq %rsi, %rax
24 ; AVX2-NEXT: andl $1, %edx
25 ; AVX2-NEXT: addq %rax, %rdx
26 ; AVX2-NEXT: andl $1, %ecx
27 ; AVX2-NEXT: addq %rdx, %rcx
28 ; AVX2-NEXT: vextractps $3, %xmm0, %r8d
29 ; AVX2-NEXT: cmpq $4, %rcx
30 ; AVX2-NEXT: cmovbl -24(%rsp,%rdi,4), %r8d
31 ; AVX2-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
32 ; AVX2-NEXT: vextractps $1, %xmm0, -24(%rsp,%rsi,4)
33 ; AVX2-NEXT: vextractps $2, %xmm0, -24(%rsp,%rax,4)
34 ; AVX2-NEXT: andl $3, %edx
35 ; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rdx,4)
36 ; AVX2-NEXT: cmpq $3, %rcx
37 ; AVX2-NEXT: movl $3, %eax
38 ; AVX2-NEXT: cmovbq %rcx, %rax
39 ; AVX2-NEXT: movl %r8d, -24(%rsp,%rax,4)
40 ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
43 ; AVX512F-LABEL: test_compress_v4i32:
45 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
46 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
47 ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
48 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
49 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
50 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
51 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm2 {%k1}
52 ; AVX512F-NEXT: vmovdqa %xmm2, %xmm0
53 ; AVX512F-NEXT: vzeroupper
56 ; AVX512VL-LABEL: test_compress_v4i32:
58 ; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
59 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
60 ; AVX512VL-NEXT: vpcompressd %xmm0, %xmm2 {%k1}
61 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
63 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru)
; test_compress_v4f32: floating-point counterpart of the <4 x i32> test;
; compresses a <4 x float> vector under a <4 x i1> mask with a passthru operand.
;  * avx2 run: stack-slot based sequence as in the integer variant, plus a
;    conditional branch (ja .LBB1_2) that picks which register value the
;    final vmovss stores.
;  * avx512f run: zmm vcompressps after trimming the mask to its low 4 bits
;    (kshiftlw/kshiftrw by 12).
;  * avx512vl run: a single xmm vcompressps under %k1.
67 define <4 x float> @test_compress_v4f32(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru) {
68 ; AVX2-LABEL: test_compress_v4f32:
70 ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
71 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
72 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
73 ; AVX2-NEXT: vpextrd $1, %xmm1, %edx
74 ; AVX2-NEXT: vmovd %xmm1, %esi
75 ; AVX2-NEXT: andl $1, %esi
76 ; AVX2-NEXT: movl %esi, %edi
77 ; AVX2-NEXT: subl %edx, %edi
78 ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
79 ; AVX2-NEXT: subl %ecx, %edi
80 ; AVX2-NEXT: vpextrd $3, %xmm1, %eax
81 ; AVX2-NEXT: subl %eax, %edi
82 ; AVX2-NEXT: andl $3, %edi
83 ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
84 ; AVX2-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
85 ; AVX2-NEXT: vextractps $1, %xmm0, -24(%rsp,%rsi,4)
86 ; AVX2-NEXT: andl $1, %edx
87 ; AVX2-NEXT: addq %rsi, %rdx
88 ; AVX2-NEXT: vextractps $2, %xmm0, -24(%rsp,%rdx,4)
89 ; AVX2-NEXT: andl $1, %ecx
90 ; AVX2-NEXT: addq %rdx, %rcx
91 ; AVX2-NEXT: andl $1, %eax
92 ; AVX2-NEXT: addq %rcx, %rax
93 ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
94 ; AVX2-NEXT: andl $3, %ecx
95 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
96 ; AVX2-NEXT: vmovss %xmm0, -24(%rsp,%rcx,4)
97 ; AVX2-NEXT: cmpq $3, %rax
98 ; AVX2-NEXT: movl $3, %ecx
99 ; AVX2-NEXT: cmovbq %rax, %rcx
100 ; AVX2-NEXT: ja .LBB1_2
101 ; AVX2-NEXT: # %bb.1:
102 ; AVX2-NEXT: vmovaps %xmm1, %xmm0
103 ; AVX2-NEXT: .LBB1_2:
104 ; AVX2-NEXT: vmovss %xmm0, -24(%rsp,%rcx,4)
105 ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
108 ; AVX512F-LABEL: test_compress_v4f32:
110 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
111 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
112 ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
113 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
114 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
115 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
116 ; AVX512F-NEXT: vcompressps %zmm0, %zmm2 {%k1}
117 ; AVX512F-NEXT: vmovdqa %xmm2, %xmm0
118 ; AVX512F-NEXT: vzeroupper
121 ; AVX512VL-LABEL: test_compress_v4f32:
123 ; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
124 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
125 ; AVX512VL-NEXT: vcompressps %xmm0, %xmm2 {%k1}
126 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
127 ; AVX512VL-NEXT: retq
128 %out = call <4 x float> @llvm.experimental.vector.compress(<4 x float> %vec, <4 x i1> %mask, <4 x float> %passthru)
; test_compress_v2i64: compresses a <2 x i64> vector under a <2 x i1> mask with
; a <2 x i64> passthru operand.
;  * avx2 run: passthru is spilled to a stack slot, the two elements are stored
;    into it at computed indices (scale 8), and the slot is reloaded.
;  * avx512f run: zmm vpcompressq; the kshiftlw/kshiftrw pair (both by 14)
;    keeps only the low 2 bits of the 16-bit mask.
;  * avx512vl run: a single xmm vpcompressq under %k1.
132 define <2 x i64> @test_compress_v2i64(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru) {
133 ; AVX2-LABEL: test_compress_v2i64:
135 ; AVX2-NEXT: vpsllq $63, %xmm1, %xmm1
136 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
137 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
138 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
139 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
140 ; AVX2-NEXT: vmovq %xmm1, %rcx
141 ; AVX2-NEXT: movl %ecx, %edx
142 ; AVX2-NEXT: subl %eax, %edx
143 ; AVX2-NEXT: andl $1, %edx
144 ; AVX2-NEXT: andl $1, %eax
145 ; AVX2-NEXT: andl $1, %ecx
146 ; AVX2-NEXT: addq %rcx, %rax
147 ; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
148 ; AVX2-NEXT: cmpq $2, %rax
149 ; AVX2-NEXT: cmovbq -24(%rsp,%rdx,8), %rsi
150 ; AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
151 ; AVX2-NEXT: movl %ecx, %ecx
152 ; AVX2-NEXT: vpextrq $1, %xmm0, -24(%rsp,%rcx,8)
153 ; AVX2-NEXT: cmpq $1, %rax
154 ; AVX2-NEXT: movl $1, %ecx
155 ; AVX2-NEXT: cmovbq %rax, %rcx
156 ; AVX2-NEXT: movq %rsi, -24(%rsp,%rcx,8)
157 ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
160 ; AVX512F-LABEL: test_compress_v2i64:
162 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
163 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
164 ; AVX512F-NEXT: vpsllq $63, %xmm1, %xmm1
165 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
166 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
167 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
168 ; AVX512F-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
169 ; AVX512F-NEXT: vmovdqa %xmm2, %xmm0
170 ; AVX512F-NEXT: vzeroupper
173 ; AVX512VL-LABEL: test_compress_v2i64:
175 ; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
176 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
177 ; AVX512VL-NEXT: vpcompressq %xmm0, %xmm2 {%k1}
178 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
179 ; AVX512VL-NEXT: retq
180 %out = call <2 x i64> @llvm.experimental.vector.compress(<2 x i64> %vec, <2 x i1> %mask, <2 x i64> %passthru)
; test_compress_v2f64: compresses a <2 x double> vector under a <2 x i1> mask
; with a <2 x double> passthru operand and returns the result.
;  * avx2 run: stack-slot based sequence with a conditional branch
;    (jb .LBB3_2) around the vshufpd that replaces xmm1 before the final store.
;  * avx512f run: zmm vcompresspd; kshiftlw/kshiftrw by 14 keeps only the low
;    2 mask bits.
;  * avx512vl run: a single xmm vcompresspd under %k1.
184 define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru) {
185 ; AVX2-LABEL: test_compress_v2f64:
187 ; AVX2-NEXT: vpsllq $63, %xmm1, %xmm1
188 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
189 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
190 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
191 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
192 ; AVX2-NEXT: vmovq %xmm1, %rcx
193 ; AVX2-NEXT: movl %ecx, %edx
194 ; AVX2-NEXT: subl %eax, %edx
195 ; AVX2-NEXT: andl $1, %edx
196 ; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
197 ; AVX2-NEXT: vmovlpd %xmm0, -{{[0-9]+}}(%rsp)
198 ; AVX2-NEXT: andl $1, %ecx
199 ; AVX2-NEXT: movl %ecx, %edx
200 ; AVX2-NEXT: vmovhpd %xmm0, -24(%rsp,%rdx,8)
201 ; AVX2-NEXT: andl $1, %eax
202 ; AVX2-NEXT: addq %rcx, %rax
203 ; AVX2-NEXT: cmpq $2, %rax
204 ; AVX2-NEXT: jb .LBB3_2
205 ; AVX2-NEXT: # %bb.1:
206 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
207 ; AVX2-NEXT: .LBB3_2:
208 ; AVX2-NEXT: cmpq $1, %rax
209 ; AVX2-NEXT: movl $1, %ecx
210 ; AVX2-NEXT: cmovbq %rax, %rcx
211 ; AVX2-NEXT: vmovsd %xmm1, -24(%rsp,%rcx,8)
212 ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
215 ; AVX512F-LABEL: test_compress_v2f64:
217 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
218 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
219 ; AVX512F-NEXT: vpsllq $63, %xmm1, %xmm1
220 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
221 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0
222 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1
223 ; AVX512F-NEXT: vcompresspd %zmm0, %zmm2 {%k1}
224 ; AVX512F-NEXT: vmovdqa %xmm2, %xmm0
225 ; AVX512F-NEXT: vzeroupper
228 ; AVX512VL-LABEL: test_compress_v2f64:
230 ; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
231 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
232 ; AVX512VL-NEXT: vcompresspd %xmm0, %xmm2 {%k1}
233 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
234 ; AVX512VL-NEXT: retq
235 %out = call <2 x double> @llvm.experimental.vector.compress(<2 x double> %vec, <2 x i1> %mask, <2 x double> %passthru)
236 ret <2 x double> %out
; test_compress_v8i32: compresses an <8 x i32> vector under an <8 x i1> mask
; with an <8 x i32> passthru operand.
;  * avx2 run: sets up an %rbp frame, saves %rbx, aligns %rsp to 32 bytes
;    (andq $-32), spills passthru to (%rsp), stores the eight elements at
;    computed indices into that slot, and reloads it as a ymm result.
;  * avx512f run: mask built with vpmovsxwq/vpsllq/vptestmq, then a zmm
;    vpcompressd; no kshift trimming appears in this variant.
;  * avx512vl run: mask via vpsllw $15 + vpmovw2m, then a ymm vpcompressd.
239 define <8 x i32> @test_compress_v8i32(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru) {
240 ; AVX2-LABEL: test_compress_v8i32:
242 ; AVX2-NEXT: pushq %rbp
243 ; AVX2-NEXT: .cfi_def_cfa_offset 16
244 ; AVX2-NEXT: .cfi_offset %rbp, -16
245 ; AVX2-NEXT: movq %rsp, %rbp
246 ; AVX2-NEXT: .cfi_def_cfa_register %rbp
247 ; AVX2-NEXT: pushq %rbx
248 ; AVX2-NEXT: andq $-32, %rsp
249 ; AVX2-NEXT: subq $64, %rsp
250 ; AVX2-NEXT: .cfi_offset %rbx, -24
251 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
252 ; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
253 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm3
254 ; AVX2-NEXT: vmovaps %ymm2, (%rsp)
255 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1
256 ; AVX2-NEXT: vpackssdw %xmm1, %xmm3, %xmm2
257 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
258 ; AVX2-NEXT: vpslld $31, %ymm2, %ymm2
259 ; AVX2-NEXT: vpsrld $31, %ymm2, %ymm2
260 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
261 ; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm2
262 ; AVX2-NEXT: vpextrd $1, %xmm2, %eax
263 ; AVX2-NEXT: vmovd %xmm2, %ecx
264 ; AVX2-NEXT: addl %eax, %ecx
265 ; AVX2-NEXT: vpextrd $2, %xmm2, %edx
266 ; AVX2-NEXT: vpextrd $3, %xmm2, %eax
267 ; AVX2-NEXT: addl %edx, %eax
268 ; AVX2-NEXT: addl %ecx, %eax
269 ; AVX2-NEXT: andl $7, %eax
270 ; AVX2-NEXT: vpextrd $1, %xmm3, %ecx
271 ; AVX2-NEXT: andl $1, %ecx
272 ; AVX2-NEXT: vmovd %xmm3, %edx
273 ; AVX2-NEXT: andl $1, %edx
274 ; AVX2-NEXT: addq %rdx, %rcx
275 ; AVX2-NEXT: vpextrd $2, %xmm3, %esi
276 ; AVX2-NEXT: andl $1, %esi
277 ; AVX2-NEXT: addq %rcx, %rsi
278 ; AVX2-NEXT: vpextrd $3, %xmm3, %edi
279 ; AVX2-NEXT: andl $1, %edi
280 ; AVX2-NEXT: addq %rsi, %rdi
281 ; AVX2-NEXT: vmovd %xmm1, %r8d
282 ; AVX2-NEXT: andl $1, %r8d
283 ; AVX2-NEXT: addq %rdi, %r8
284 ; AVX2-NEXT: vpextrd $1, %xmm1, %r9d
285 ; AVX2-NEXT: andl $1, %r9d
286 ; AVX2-NEXT: addq %r8, %r9
287 ; AVX2-NEXT: vpextrd $2, %xmm1, %r10d
288 ; AVX2-NEXT: andl $1, %r10d
289 ; AVX2-NEXT: addq %r9, %r10
290 ; AVX2-NEXT: vpextrd $3, %xmm1, %r11d
291 ; AVX2-NEXT: andl $1, %r11d
292 ; AVX2-NEXT: addq %r10, %r11
293 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
294 ; AVX2-NEXT: vextractps $3, %xmm1, %ebx
295 ; AVX2-NEXT: cmpq $8, %r11
296 ; AVX2-NEXT: cmovbl (%rsp,%rax,4), %ebx
297 ; AVX2-NEXT: vmovss %xmm0, (%rsp)
298 ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rdx,4)
299 ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
300 ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rsi,4)
301 ; AVX2-NEXT: andl $7, %edi
302 ; AVX2-NEXT: vmovss %xmm1, (%rsp,%rdi,4)
303 ; AVX2-NEXT: andl $7, %r8d
304 ; AVX2-NEXT: vextractps $1, %xmm1, (%rsp,%r8,4)
305 ; AVX2-NEXT: andl $7, %r9d
306 ; AVX2-NEXT: vextractps $2, %xmm1, (%rsp,%r9,4)
307 ; AVX2-NEXT: andl $7, %r10d
308 ; AVX2-NEXT: vextractps $3, %xmm1, (%rsp,%r10,4)
309 ; AVX2-NEXT: cmpq $7, %r11
310 ; AVX2-NEXT: movl $7, %eax
311 ; AVX2-NEXT: cmovbq %r11, %rax
312 ; AVX2-NEXT: movl %eax, %eax
313 ; AVX2-NEXT: movl %ebx, (%rsp,%rax,4)
314 ; AVX2-NEXT: vmovaps (%rsp), %ymm0
315 ; AVX2-NEXT: leaq -8(%rbp), %rsp
316 ; AVX2-NEXT: popq %rbx
317 ; AVX2-NEXT: popq %rbp
318 ; AVX2-NEXT: .cfi_def_cfa %rsp, 8
321 ; AVX512F-LABEL: test_compress_v8i32:
323 ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
324 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
325 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
326 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
327 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
328 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm2 {%k1}
329 ; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
332 ; AVX512VL-LABEL: test_compress_v8i32:
334 ; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
335 ; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
336 ; AVX512VL-NEXT: vpcompressd %ymm0, %ymm2 {%k1}
337 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
338 ; AVX512VL-NEXT: retq
339 %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> %passthru)
; test_compress_v8f32: compresses an <8 x float> vector under an <8 x i1> mask
; with an <8 x float> passthru operand.
;  * avx2 run: %rbp frame with a 32-byte aligned stack slot; elements are
;    stored one at a time at running indices, and a conditional branch
;    (jae .LBB5_2) picks the register value written by the final vmovss.
;  * avx512f run: mask built with vpmovsxwq/vpsllq/vptestmq, then a zmm
;    vcompressps.
;  * avx512vl run: mask via vpsllw $15 + vpmovw2m, then a ymm vcompressps.
343 define <8 x float> @test_compress_v8f32(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru) {
344 ; AVX2-LABEL: test_compress_v8f32:
346 ; AVX2-NEXT: pushq %rbp
347 ; AVX2-NEXT: .cfi_def_cfa_offset 16
348 ; AVX2-NEXT: .cfi_offset %rbp, -16
349 ; AVX2-NEXT: movq %rsp, %rbp
350 ; AVX2-NEXT: .cfi_def_cfa_register %rbp
351 ; AVX2-NEXT: andq $-32, %rsp
352 ; AVX2-NEXT: subq $64, %rsp
353 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
354 ; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
355 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm3
356 ; AVX2-NEXT: vmovaps %ymm2, (%rsp)
357 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1
358 ; AVX2-NEXT: vpackssdw %xmm1, %xmm3, %xmm2
359 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
360 ; AVX2-NEXT: vpslld $31, %ymm2, %ymm2
361 ; AVX2-NEXT: vpsrld $31, %ymm2, %ymm2
362 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
363 ; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm2
364 ; AVX2-NEXT: vpextrd $1, %xmm2, %eax
365 ; AVX2-NEXT: vmovd %xmm2, %ecx
366 ; AVX2-NEXT: addl %eax, %ecx
367 ; AVX2-NEXT: vpextrd $2, %xmm2, %eax
368 ; AVX2-NEXT: vpextrd $3, %xmm2, %edx
369 ; AVX2-NEXT: addl %eax, %edx
370 ; AVX2-NEXT: addl %ecx, %edx
371 ; AVX2-NEXT: andl $7, %edx
372 ; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
373 ; AVX2-NEXT: vmovss %xmm0, (%rsp)
374 ; AVX2-NEXT: vmovd %xmm3, %eax
375 ; AVX2-NEXT: andl $1, %eax
376 ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4)
377 ; AVX2-NEXT: vpextrd $1, %xmm3, %ecx
378 ; AVX2-NEXT: andl $1, %ecx
379 ; AVX2-NEXT: addq %rax, %rcx
380 ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
381 ; AVX2-NEXT: vpextrd $2, %xmm3, %eax
382 ; AVX2-NEXT: andl $1, %eax
383 ; AVX2-NEXT: addq %rcx, %rax
384 ; AVX2-NEXT: vextractps $3, %xmm0, (%rsp,%rax,4)
385 ; AVX2-NEXT: vpextrd $3, %xmm3, %ecx
386 ; AVX2-NEXT: andl $1, %ecx
387 ; AVX2-NEXT: addq %rax, %rcx
388 ; AVX2-NEXT: vmovd %xmm1, %eax
389 ; AVX2-NEXT: andl $1, %eax
390 ; AVX2-NEXT: addq %rcx, %rax
391 ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
392 ; AVX2-NEXT: andl $7, %ecx
393 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
394 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rcx,4)
395 ; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
396 ; AVX2-NEXT: andl $1, %ecx
397 ; AVX2-NEXT: addq %rax, %rcx
398 ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
399 ; AVX2-NEXT: andl $7, %eax
400 ; AVX2-NEXT: vextractps $1, %xmm0, (%rsp,%rax,4)
401 ; AVX2-NEXT: vpextrd $2, %xmm1, %edx
402 ; AVX2-NEXT: andl $1, %edx
403 ; AVX2-NEXT: addq %rcx, %rdx
404 ; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
405 ; AVX2-NEXT: andl $7, %ecx
406 ; AVX2-NEXT: vextractps $2, %xmm0, (%rsp,%rcx,4)
407 ; AVX2-NEXT: vpextrd $3, %xmm1, %eax
408 ; AVX2-NEXT: andl $1, %eax
409 ; AVX2-NEXT: addq %rdx, %rax
410 ; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
411 ; AVX2-NEXT: andl $7, %edx
412 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
413 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rdx,4)
414 ; AVX2-NEXT: cmpq $8, %rax
415 ; AVX2-NEXT: jae .LBB5_2
416 ; AVX2-NEXT: # %bb.1:
417 ; AVX2-NEXT: vmovaps %xmm2, %xmm0
418 ; AVX2-NEXT: .LBB5_2:
419 ; AVX2-NEXT: cmpq $7, %rax
420 ; AVX2-NEXT: movl $7, %ecx
421 ; AVX2-NEXT: cmovbq %rax, %rcx
422 ; AVX2-NEXT: movl %ecx, %eax
423 ; AVX2-NEXT: vmovss %xmm0, (%rsp,%rax,4)
424 ; AVX2-NEXT: vmovaps (%rsp), %ymm0
425 ; AVX2-NEXT: movq %rbp, %rsp
426 ; AVX2-NEXT: popq %rbp
427 ; AVX2-NEXT: .cfi_def_cfa %rsp, 8
430 ; AVX512F-LABEL: test_compress_v8f32:
432 ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
433 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
434 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
435 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
436 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
437 ; AVX512F-NEXT: vcompressps %zmm0, %zmm2 {%k1}
438 ; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
441 ; AVX512VL-LABEL: test_compress_v8f32:
443 ; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
444 ; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
445 ; AVX512VL-NEXT: vcompressps %ymm0, %ymm2 {%k1}
446 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
447 ; AVX512VL-NEXT: retq
448 %out = call <8 x float> @llvm.experimental.vector.compress(<8 x float> %vec, <8 x i1> %mask, <8 x float> %passthru)
; test_compress_v4i64: compresses a <4 x i64> vector under a <4 x i1> mask with
; a <4 x i64> passthru operand.
;  * avx2 run: %rbp frame with a 32-byte aligned stack slot; passthru is
;    spilled to (%rsp), elements are stored at computed indices (scale 8), and
;    the slot is reloaded as a ymm result.
;  * avx512f run: zmm vpcompressq; kshiftlw/kshiftrw by 12 keeps only the low
;    4 mask bits.
;  * avx512vl run: a single ymm vpcompressq under %k1.
452 define <4 x i64> @test_compress_v4i64(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru) {
453 ; AVX2-LABEL: test_compress_v4i64:
455 ; AVX2-NEXT: pushq %rbp
456 ; AVX2-NEXT: .cfi_def_cfa_offset 16
457 ; AVX2-NEXT: .cfi_offset %rbp, -16
458 ; AVX2-NEXT: movq %rsp, %rbp
459 ; AVX2-NEXT: .cfi_def_cfa_register %rbp
460 ; AVX2-NEXT: andq $-32, %rsp
461 ; AVX2-NEXT: subq $64, %rsp
462 ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
463 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
464 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
465 ; AVX2-NEXT: vmovaps %ymm2, (%rsp)
466 ; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm2
467 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
468 ; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
469 ; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
470 ; AVX2-NEXT: vmovq %xmm2, %rax
471 ; AVX2-NEXT: addl %ecx, %eax
472 ; AVX2-NEXT: andl $3, %eax
473 ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
474 ; AVX2-NEXT: vmovq %xmm1, %rdx
475 ; AVX2-NEXT: andl $1, %edx
476 ; AVX2-NEXT: movl %edx, %esi
477 ; AVX2-NEXT: subq %rcx, %rdx
478 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
479 ; AVX2-NEXT: vmovq %xmm1, %rcx
480 ; AVX2-NEXT: movl %edx, %edi
481 ; AVX2-NEXT: subq %rcx, %rdx
482 ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
483 ; AVX2-NEXT: movq %rdx, %r8
484 ; AVX2-NEXT: subq %rcx, %r8
485 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
486 ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
487 ; AVX2-NEXT: cmpq $4, %r8
488 ; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rcx
489 ; AVX2-NEXT: vmovq %xmm0, (%rsp)
490 ; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%rsi,8)
491 ; AVX2-NEXT: vmovq %xmm1, (%rsp,%rdi,8)
492 ; AVX2-NEXT: andl $3, %edx
493 ; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%rdx,8)
494 ; AVX2-NEXT: cmpq $3, %r8
495 ; AVX2-NEXT: movl $3, %eax
496 ; AVX2-NEXT: cmovbq %r8, %rax
497 ; AVX2-NEXT: movl %eax, %eax
498 ; AVX2-NEXT: movq %rcx, (%rsp,%rax,8)
499 ; AVX2-NEXT: vmovaps (%rsp), %ymm0
500 ; AVX2-NEXT: movq %rbp, %rsp
501 ; AVX2-NEXT: popq %rbp
502 ; AVX2-NEXT: .cfi_def_cfa %rsp, 8
505 ; AVX512F-LABEL: test_compress_v4i64:
507 ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
508 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
509 ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
510 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
511 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
512 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
513 ; AVX512F-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
514 ; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
517 ; AVX512VL-LABEL: test_compress_v4i64:
519 ; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
520 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
521 ; AVX512VL-NEXT: vpcompressq %ymm0, %ymm2 {%k1}
522 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
523 ; AVX512VL-NEXT: retq
524 %out = call <4 x i64> @llvm.experimental.vector.compress(<4 x i64> %vec, <4 x i1> %mask, <4 x i64> %passthru)
; test_compress_v4f64: compresses a <4 x double> vector under a <4 x i1> mask
; with a <4 x double> passthru operand and returns the result.
;  * avx512f run: zmm vcompresspd; kshiftlw/kshiftrw by 12 keeps only the low
;    4 mask bits.
;  * avx512vl run: a single ymm vcompresspd under %k1.
; NOTE(review): no avx2-prefixed assertions are visible for this function even
; though the first RUN line enables that prefix - confirm this is intentional.
528 define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru) {
529 ; AVX512F-LABEL: test_compress_v4f64:
531 ; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
532 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
533 ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
534 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
535 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
536 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
537 ; AVX512F-NEXT: vcompresspd %zmm0, %zmm2 {%k1}
538 ; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
541 ; AVX512VL-LABEL: test_compress_v4f64:
543 ; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
544 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
545 ; AVX512VL-NEXT: vcompresspd %ymm0, %ymm2 {%k1}
546 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
547 ; AVX512VL-NEXT: retq
548 %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> %passthru)
549 ret <4 x double> %out
; test_compress_v16i32: compresses a <16 x i32> vector under a <16 x i1> mask
; with a <16 x i32> passthru operand.
;  * avx512f run: mask built with vpmovsxbd/vpslld/vptestmd, then a zmm
;    vpcompressd into the passthru register.
;  * avx512vl run: mask via vpsllw $7 + vpmovb2m, then the same zmm vpcompressd.
552 define <16 x i32> @test_compress_v16i32(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru) {
553 ; AVX512F-LABEL: test_compress_v16i32:
555 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
556 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
557 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
558 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm2 {%k1}
559 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
562 ; AVX512VL-LABEL: test_compress_v16i32:
564 ; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1
565 ; AVX512VL-NEXT: vpmovb2m %xmm1, %k1
566 ; AVX512VL-NEXT: vpcompressd %zmm0, %zmm2 {%k1}
567 ; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
568 ; AVX512VL-NEXT: retq
569 %out = call <16 x i32> @llvm.experimental.vector.compress(<16 x i32> %vec, <16 x i1> %mask, <16 x i32> %passthru)
; test_compress_v16f32: compresses a <16 x float> vector under a <16 x i1> mask
; with a <16 x float> passthru operand and returns the result.
;  * avx512f run: mask built with vpmovsxbd/vpslld/vptestmd, then a zmm
;    vcompressps into the passthru register.
;  * avx512vl run: mask via vpsllw $7 + vpmovb2m, then the same zmm vcompressps.
573 define <16 x float> @test_compress_v16f32(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru) {
574 ; AVX512F-LABEL: test_compress_v16f32:
576 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
577 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
578 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
579 ; AVX512F-NEXT: vcompressps %zmm0, %zmm2 {%k1}
580 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
583 ; AVX512VL-LABEL: test_compress_v16f32:
585 ; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1
586 ; AVX512VL-NEXT: vpmovb2m %xmm1, %k1
587 ; AVX512VL-NEXT: vcompressps %zmm0, %zmm2 {%k1}
588 ; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
589 ; AVX512VL-NEXT: retq
590 %out = call <16 x float> @llvm.experimental.vector.compress(<16 x float> %vec, <16 x i1> %mask, <16 x float> %passthru)
591 ret <16 x float> %out
; test_compress_v8i64: compresses an <8 x i64> vector under an <8 x i1> mask
; with an <8 x i64> passthru operand.
;  * avx512f run: mask built with vpmovsxwq/vpsllq/vptestmq, then a zmm
;    vpcompressq into the passthru register.
;  * avx512vl run: mask via vpsllw $15 + vpmovw2m, then the same zmm vpcompressq.
594 define <8 x i64> @test_compress_v8i64(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru) {
595 ; AVX512F-LABEL: test_compress_v8i64:
597 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
598 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
599 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
600 ; AVX512F-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
601 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
604 ; AVX512VL-LABEL: test_compress_v8i64:
606 ; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
607 ; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
608 ; AVX512VL-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
609 ; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
610 ; AVX512VL-NEXT: retq
611 %out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %vec, <8 x i1> %mask, <8 x i64> %passthru)
; test_compress_v8f64: compresses an <8 x double> vector under an <8 x i1> mask
; with an <8 x double> passthru operand and returns the result.
;  * avx512f run: mask built with vpmovsxwq/vpsllq/vptestmq, then a zmm
;    vcompresspd into the passthru register.
;  * avx512vl run: mask via vpsllw $15 + vpmovw2m, then the same zmm vcompresspd.
615 define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x double> %passthru) {
616 ; AVX512F-LABEL: test_compress_v8f64:
618 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
619 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
620 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
621 ; AVX512F-NEXT: vcompresspd %zmm0, %zmm2 {%k1}
622 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
625 ; AVX512VL-LABEL: test_compress_v8f64:
627 ; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
628 ; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
629 ; AVX512VL-NEXT: vcompresspd %zmm0, %zmm2 {%k1}
630 ; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
631 ; AVX512VL-NEXT: retq
632 %out = call <8 x double> @llvm.experimental.vector.compress(<8 x double> %vec, <8 x i1> %mask, <8 x double> %passthru)
633 ret <8 x double> %out
; test_compress_v16i8: compresses a <16 x i8> vector under a <16 x i1> mask
; with a <16 x i8> passthru operand.
;  * avx512f run: both %vec and passthru are zero-extended from bytes to dwords
;    (vpmovzxbd), compressed with a zmm vpcompressd, and the result is
;    narrowed back to bytes with vpmovdb.
;  * avx512vl run: a direct xmm vpcompressb under a mask from vpmovb2m.
636 define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru) {
637 ; AVX512F-LABEL: test_compress_v16i8:
639 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
640 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
641 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
642 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
643 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
644 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm1 {%k1}
645 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm0
646 ; AVX512F-NEXT: vzeroupper
649 ; AVX512VL-LABEL: test_compress_v16i8:
651 ; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1
652 ; AVX512VL-NEXT: vpmovb2m %xmm1, %k1
653 ; AVX512VL-NEXT: vpcompressb %xmm0, %xmm2 {%k1}
654 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
655 ; AVX512VL-NEXT: retq
656 %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> %passthru)
; test_compress_v8i16: compresses an <8 x i16> vector under an <8 x i1> mask
; with an <8 x i16> passthru operand.
;  * avx512f run: both %vec and passthru are zero-extended from words to qwords
;    (vpmovzxwq), compressed with a zmm vpcompressq, and the result is
;    narrowed back to words with vpmovqw.
;  * avx512vl run: a direct xmm vpcompressw under a mask from vpmovw2m.
660 define <8 x i16> @test_compress_v8i16(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru) {
661 ; AVX512F-LABEL: test_compress_v8i16:
663 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
664 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
665 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
666 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
667 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
668 ; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
669 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm0
670 ; AVX512F-NEXT: vzeroupper
673 ; AVX512VL-LABEL: test_compress_v8i16:
675 ; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1
676 ; AVX512VL-NEXT: vpmovw2m %xmm1, %k1
677 ; AVX512VL-NEXT: vpcompressw %xmm0, %xmm2 {%k1}
678 ; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
679 ; AVX512VL-NEXT: retq
680 %out = call <8 x i16> @llvm.experimental.vector.compress(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> %passthru)
; test_compress_v32i8: compresses a <32 x i8> vector under a <32 x i1> mask
; with a <32 x i8> passthru operand.
;  * avx512f run: the input is handled as two 16-byte halves. Each half is
;    zero-extended to dwords, compressed with a zeroing-masked zmm vpcompressd
;    ({z}), and narrowed to bytes with vpmovdb into a 32-byte aligned stack
;    buffer; the second half is stored at (%rsp,%rax), where %eax comes from
;    the kshiftrw/kxorw sequence on the first half's mask followed by
;    andl $31. The buffer is then merged with passthru by vpblendvb.
;  * avx512vl run: a direct ymm vpcompressb under a mask from vpmovb2m.
684 define <32 x i8> @test_compress_v32i8(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru) {
685 ; AVX512F-LABEL: test_compress_v32i8:
687 ; AVX512F-NEXT: pushq %rbp
688 ; AVX512F-NEXT: .cfi_def_cfa_offset 16
689 ; AVX512F-NEXT: .cfi_offset %rbp, -16
690 ; AVX512F-NEXT: movq %rsp, %rbp
691 ; AVX512F-NEXT: .cfi_def_cfa_register %rbp
692 ; AVX512F-NEXT: andq $-32, %rsp
693 ; AVX512F-NEXT: subq $64, %rsp
694 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
695 ; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
696 ; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
697 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
698 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm3
699 ; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
700 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k2
701 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
702 ; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k2} {z}
703 ; AVX512F-NEXT: vpmovdb %zmm3, (%rsp)
704 ; AVX512F-NEXT: kshiftrw $8, %k2, %k0
705 ; AVX512F-NEXT: kxorw %k0, %k2, %k0
706 ; AVX512F-NEXT: kshiftrw $4, %k0, %k2
707 ; AVX512F-NEXT: kxorw %k2, %k0, %k0
708 ; AVX512F-NEXT: kshiftrw $2, %k0, %k2
709 ; AVX512F-NEXT: kxorw %k2, %k0, %k0
710 ; AVX512F-NEXT: kshiftrw $1, %k0, %k2
711 ; AVX512F-NEXT: kxorw %k2, %k0, %k0
712 ; AVX512F-NEXT: kmovw %k0, %eax
713 ; AVX512F-NEXT: andl $31, %eax
714 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
715 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
716 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
717 ; AVX512F-NEXT: vpmovdb %zmm0, (%rsp,%rax)
718 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm0
719 ; AVX512F-NEXT: vpblendvb %ymm0, (%rsp), %ymm2, %ymm0
720 ; AVX512F-NEXT: movq %rbp, %rsp
721 ; AVX512F-NEXT: popq %rbp
722 ; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
725 ; AVX512VL-LABEL: test_compress_v32i8:
727 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
728 ; AVX512VL-NEXT: vpmovb2m %ymm1, %k1
729 ; AVX512VL-NEXT: vpcompressb %ymm0, %ymm2 {%k1}
730 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
731 ; AVX512VL-NEXT: retq
732 %out = call <32 x i8> @llvm.experimental.vector.compress(<32 x i8> %vec, <32 x i1> %mask, <32 x i8> %passthru)
; test_compress_v16i16: compresses a <16 x i16> vector under a <16 x i1> mask
; with a <16 x i16> passthru operand.
;  * avx512f run: both %vec and passthru are zero-extended from words to dwords
;    (vpmovzxwd), compressed with a zmm vpcompressd, and the result is
;    narrowed back to words with vpmovdw.
;  * avx512vl run: a direct ymm vpcompressw under a mask from vpmovb2m.
736 define <16 x i16> @test_compress_v16i16(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru) {
737 ; AVX512F-LABEL: test_compress_v16i16:
739 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
740 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
741 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
742 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
743 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
744 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm1 {%k1}
745 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm0
748 ; AVX512VL-LABEL: test_compress_v16i16:
750 ; AVX512VL-NEXT: vpsllw $7, %xmm1, %xmm1
751 ; AVX512VL-NEXT: vpmovb2m %xmm1, %k1
752 ; AVX512VL-NEXT: vpcompressw %ymm0, %ymm2 {%k1}
753 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
754 ; AVX512VL-NEXT: retq
755 %out = call <16 x i16> @llvm.experimental.vector.compress(<16 x i16> %vec, <16 x i1> %mask, <16 x i16> %passthru)
; test_compress_v64i8: compresses a <64 x i8> vector under a <64 x i1> mask
; with a <64 x i8> passthru operand.
;  * avx512vl run: mask via vpsllw $7 + vpmovb2m on zmm1, then a zmm
;    vpcompressb into the passthru register.
; NOTE(review): only avx512vl-prefixed assertions are visible for this
; function - confirm the other RUN configurations are meant to be unchecked.
759 define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru) {
760 ; AVX512VL-LABEL: test_compress_v64i8:
762 ; AVX512VL-NEXT: vpsllw $7, %zmm1, %zmm1
763 ; AVX512VL-NEXT: vpmovb2m %zmm1, %k1
764 ; AVX512VL-NEXT: vpcompressb %zmm0, %zmm2 {%k1}
765 ; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
766 ; AVX512VL-NEXT: retq
767 %out = call <64 x i8> @llvm.experimental.vector.compress(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> %passthru)
; <32 x i16> compress with a variable mask and an explicit passthru operand.
; Expected code for the plain avx512f configuration: each 16-element half is
; widened to dwords (vpmovzxwd), compressed with a zeroing vpcompressd,
; truncated back with vpmovdw and stored to a 64-byte aligned stack buffer;
; the two halves are then blended with the passthru halves via vpblendvb.
; NOTE(review): the element index of the second store is produced by a
; kshiftrw/kxorw reduction of the low-half mask followed by andl $31; confirm
; against the lowering code that this is the intended offset computation.
; Expected code for the avx512vbmi2 configuration: a single vpcompressw into
; the passthru register %zmm2, then a copy to the return register.
771 define <32 x i16> @test_compress_v32i16(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru) {
772 ; AVX512F-LABEL: test_compress_v32i16:
774 ; AVX512F-NEXT: pushq %rbp
775 ; AVX512F-NEXT: .cfi_def_cfa_offset 16
776 ; AVX512F-NEXT: .cfi_offset %rbp, -16
777 ; AVX512F-NEXT: movq %rsp, %rbp
778 ; AVX512F-NEXT: .cfi_def_cfa_register %rbp
779 ; AVX512F-NEXT: andq $-64, %rsp
780 ; AVX512F-NEXT: subq $128, %rsp
781 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
782 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
783 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
784 ; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
785 ; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
786 ; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
787 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
788 ; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
789 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2
790 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
791 ; AVX512F-NEXT: vpcompressd %zmm1, %zmm1 {%k2} {z}
792 ; AVX512F-NEXT: vpmovdw %zmm1, (%rsp)
793 ; AVX512F-NEXT: kshiftrw $8, %k2, %k0
794 ; AVX512F-NEXT: kxorw %k0, %k2, %k0
795 ; AVX512F-NEXT: kshiftrw $4, %k0, %k2
796 ; AVX512F-NEXT: kxorw %k2, %k0, %k0
797 ; AVX512F-NEXT: kshiftrw $2, %k0, %k2
798 ; AVX512F-NEXT: kxorw %k2, %k0, %k0
799 ; AVX512F-NEXT: kshiftrw $1, %k0, %k2
800 ; AVX512F-NEXT: kxorw %k2, %k0, %k0
801 ; AVX512F-NEXT: kmovw %k0, %eax
802 ; AVX512F-NEXT: andl $31, %eax
803 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
804 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
805 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
806 ; AVX512F-NEXT: vpmovdw %zmm0, (%rsp,%rax,2)
807 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
808 ; AVX512F-NEXT: vpsllw $15, %ymm4, %ymm1
809 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
810 ; AVX512F-NEXT: vpblendvb %ymm1, {{[0-9]+}}(%rsp), %ymm0, %ymm0
811 ; AVX512F-NEXT: vpsllw $15, %ymm3, %ymm1
812 ; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
813 ; AVX512F-NEXT: vpblendvb %ymm1, (%rsp), %ymm2, %ymm1
814 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
815 ; AVX512F-NEXT: movq %rbp, %rsp
816 ; AVX512F-NEXT: popq %rbp
817 ; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
820 ; AVX512VL-LABEL: test_compress_v32i16:
822 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
823 ; AVX512VL-NEXT: vpmovb2m %ymm1, %k1
824 ; AVX512VL-NEXT: vpcompressw %zmm0, %zmm2 {%k1}
825 ; AVX512VL-NEXT: vmovdqa64 %zmm2, %zmm0
826 ; AVX512VL-NEXT: retq
827 %out = call <32 x i16> @llvm.experimental.vector.compress(<32 x i16> %vec, <32 x i1> %mask, <32 x i16> %passthru)
; <64 x i32> compress, a type wider than one 512-bit register. Note that the
; mask is the first argument here and that the %passthru argument is declared
; but not used; the intrinsic call passes undef as its passthru operand.
; Expected code for the avx512vbmi2 configuration: a 576-byte, 64-byte aligned
; stack frame; the 64-bit mask is split into four 16-bit pieces (%k1, %k2,
; %k4, %k3); each 16-element piece of the input is compressed with a zeroing
; vpcompressd and stored to the stack at an index derived from a
; kshiftrw/kxorw reduction of the preceding mask piece; the pieces are then
; copied between stack slots and the result is reloaded into %zmm0-%zmm3.
; NOTE(review): as with the other split cases, confirm against the lowering
; code that the kxorw-based reductions give the intended store offsets.
831 define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i32> %passthru) {
832 ; AVX512VL-LABEL: test_compress_large:
834 ; AVX512VL-NEXT: pushq %rbp
835 ; AVX512VL-NEXT: .cfi_def_cfa_offset 16
836 ; AVX512VL-NEXT: .cfi_offset %rbp, -16
837 ; AVX512VL-NEXT: movq %rsp, %rbp
838 ; AVX512VL-NEXT: .cfi_def_cfa_register %rbp
839 ; AVX512VL-NEXT: andq $-64, %rsp
840 ; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240
841 ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
842 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k1
843 ; AVX512VL-NEXT: kshiftrq $32, %k1, %k4
844 ; AVX512VL-NEXT: kshiftrd $16, %k4, %k3
845 ; AVX512VL-NEXT: kshiftrd $16, %k1, %k2
846 ; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z}
847 ; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp)
848 ; AVX512VL-NEXT: kshiftrw $8, %k1, %k0
849 ; AVX512VL-NEXT: kxorw %k0, %k1, %k0
850 ; AVX512VL-NEXT: kshiftrw $4, %k0, %k5
851 ; AVX512VL-NEXT: kxorw %k5, %k0, %k0
852 ; AVX512VL-NEXT: kshiftrw $2, %k0, %k5
853 ; AVX512VL-NEXT: kxorw %k5, %k0, %k0
854 ; AVX512VL-NEXT: kshiftrw $1, %k0, %k5
855 ; AVX512VL-NEXT: kxorw %k5, %k0, %k0
856 ; AVX512VL-NEXT: kmovd %k0, %eax
857 ; AVX512VL-NEXT: andl $31, %eax
858 ; AVX512VL-NEXT: vpcompressd %zmm2, %zmm0 {%k2} {z}
859 ; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4)
860 ; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z}
861 ; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
862 ; AVX512VL-NEXT: kshiftrw $8, %k4, %k0
863 ; AVX512VL-NEXT: kxorw %k0, %k4, %k0
864 ; AVX512VL-NEXT: kshiftrw $4, %k0, %k4
865 ; AVX512VL-NEXT: kxorw %k4, %k0, %k0
866 ; AVX512VL-NEXT: kshiftrw $2, %k0, %k4
867 ; AVX512VL-NEXT: kxorw %k4, %k0, %k0
868 ; AVX512VL-NEXT: kshiftrw $1, %k0, %k4
869 ; AVX512VL-NEXT: kxorw %k4, %k0, %k0
870 ; AVX512VL-NEXT: kmovd %k0, %eax
871 ; AVX512VL-NEXT: andl $31, %eax
872 ; AVX512VL-NEXT: vpcompressd %zmm4, %zmm0 {%k3} {z}
873 ; AVX512VL-NEXT: vmovdqa64 %zmm0, 128(%rsp,%rax,4)
874 ; AVX512VL-NEXT: vmovaps (%rsp), %zmm0
875 ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
876 ; AVX512VL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
877 ; AVX512VL-NEXT: kxorw %k2, %k1, %k0
878 ; AVX512VL-NEXT: kshiftrw $8, %k0, %k1
879 ; AVX512VL-NEXT: kxorw %k1, %k0, %k0
880 ; AVX512VL-NEXT: kshiftrw $4, %k0, %k1
881 ; AVX512VL-NEXT: kxorw %k1, %k0, %k0
882 ; AVX512VL-NEXT: kshiftrw $2, %k0, %k1
883 ; AVX512VL-NEXT: kxorw %k1, %k0, %k0
884 ; AVX512VL-NEXT: kshiftrw $1, %k0, %k1
885 ; AVX512VL-NEXT: kxorw %k1, %k0, %k0
886 ; AVX512VL-NEXT: kmovd %k0, %eax
887 ; AVX512VL-NEXT: andl $63, %eax
888 ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
889 ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
890 ; AVX512VL-NEXT: vmovaps %zmm0, 256(%rsp,%rax,4)
891 ; AVX512VL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
892 ; AVX512VL-NEXT: vmovaps %zmm2, 320(%rsp,%rax,4)
893 ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
894 ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1
895 ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2
896 ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm3
897 ; AVX512VL-NEXT: movq %rbp, %rsp
898 ; AVX512VL-NEXT: popq %rbp
899 ; AVX512VL-NEXT: .cfi_def_cfa %rsp, 8
900 ; AVX512VL-NEXT: retq
901 %out = call <64 x i32> @llvm.experimental.vector.compress(<64 x i32> %vec, <64 x i1> %mask, <64 x i32> undef)
; Constant input vector <3, 5, 7, 9> with constant mask <0, 1, 0, 1>. The
; compress is expected to fold away entirely on every configuration, leaving a
; single constant load whose first two lanes are the selected elements 5 and 9.
905 define <4 x i32> @test_compress_all_const() {
906 ; CHECK-LABEL: test_compress_all_const:
908 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [5,9,0,0]
910 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> <i32 3, i32 5, i32 7, i32 9>,
911 <4 x i1> <i1 0, i1 1, i1 0, i1 1>,
; Variable input with the constant mask <1, undef, 0, 1> and an undef
; passthru. Expected to lower to one shuffle on every configuration; the
; shuffle places source lanes 0 and 3 in result lanes 0 and 1.
916 define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) {
917 ; CHECK-LABEL: test_compress_const_mask:
919 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
921 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> undef)
; Same constant mask <1, undef, 0, 1> as test_compress_const_mask, but with a
; variable passthru. Expected to lower to one two-source shuffle: result lanes
; 0 and 1 take lanes 0 and 3 of %vec, result lanes 2 and 3 come from %passthru.
925 define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) {
926 ; CHECK-LABEL: test_compress_const_mask_passthrough:
928 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,3]
930 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 undef, i1 0, i1 1>, <4 x i32> %passthru)
; Constant mask <1, 0, 0, 1> with the constant passthru <5, 6, 7, 8>.
; Expected code on every configuration: a shuffle moves source lanes 0 and 3
; into result lanes 0 and 1, then the passthru constants 7 and 8 are inserted
; into result lanes 2 and 3 through %eax.
934 define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) {
935 ; CHECK-LABEL: test_compress_const_mask_const_passthrough:
937 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
938 ; CHECK-NEXT: movl $7, %eax
939 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
940 ; CHECK-NEXT: movl $8, %eax
941 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
943 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>)
947 ; The splat-mask and undef-mask tests below take an unused placeholder first argument (%ignore) so that %vec does not
948 ; already sit in the return register; this lets us check that the compress folds to a plain register copy or to no code at all.
; All-ones constant mask with an undef passthru: every element is selected,
; so the result is %vec itself. Expected code on every configuration is a
; plain copy of the second vector argument register to the return register.
949 define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) {
950 ; CHECK-LABEL: test_compress_const_splat1_mask:
952 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
954 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef)
; All-zero constant mask with an undef passthru: no element is selected and
; the passthru is undef, so no register copy is asserted for this function.
957 define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) {
958 ; CHECK-LABEL: test_compress_const_splat0_mask:
961 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
; Undef mask with an undef passthru: as in the all-zero mask case, no register
; copy is asserted for this function.
964 define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) {
965 ; CHECK-LABEL: test_compress_undef_mask:
968 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef)
; All-zero constant mask with a variable passthru: no element is selected, so
; the result is the passthru. Expected code on every configuration is a plain
; copy of the third vector argument register to the return register.
971 define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) {
972 ; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru:
974 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
976 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru)
; All-zero constant mask with an undef passthru; counterpart of the
; with_passthru variant. The IR body is identical to that of
; test_compress_const_splat0_mask, and no register copy is asserted.
979 define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) {
980 ; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru:
983 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef)
; <4 x i8> compress (a vector narrower than 128 bits) with an undef passthru.
; Expected code for the plain avx512f configuration: the mask is moved to a
; k-register and its upper 12 bits are cleared (kshiftlw/kshiftrw by 12), the
; bytes are widened to dwords, compressed with a zeroing 512-bit vpcompressd
; and truncated back with vpmovdb.
; Expected code for the avx512vbmi2 configuration: a zeroing 128-bit
; vpcompressb applied directly to the input register.
987 define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) {
988 ; AVX512F-LABEL: test_compress_small:
990 ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
991 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
992 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
993 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
994 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
995 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
996 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
997 ; AVX512F-NEXT: vzeroupper
1000 ; AVX512VL-LABEL: test_compress_small:
1001 ; AVX512VL: # %bb.0:
1002 ; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
1003 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
1004 ; AVX512VL-NEXT: vpcompressb %xmm0, %xmm0 {%k1} {z}
1005 ; AVX512VL-NEXT: retq
1006 %out = call <4 x i8> @llvm.experimental.vector.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef)
; <4 x i4> compress (an element type with no native register form) with an
; undef passthru. All three configurations operate on 32-bit lanes.
; Expected code for the avx2 configuration: elements are written one by one
; to a stack slot, with each store index derived from the mask lanes seen so
; far, and the result is reloaded as one vector.
; Expected code for the plain avx512f configuration: a zeroing 512-bit
; vpcompressd with the mask limited to its low 4 bits.
; Expected code for the avx512vbmi2 configuration: a zeroing 128-bit
; vpcompressd.
1010 define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) {
1011 ; AVX2-LABEL: test_compress_illegal_element_type:
1013 ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
1014 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
1015 ; AVX2-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
1016 ; AVX2-NEXT: vmovd %xmm1, %eax
1017 ; AVX2-NEXT: andl $1, %eax
1018 ; AVX2-NEXT: vextractps $1, %xmm0, -24(%rsp,%rax,4)
1019 ; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
1020 ; AVX2-NEXT: subl %ecx, %eax
1021 ; AVX2-NEXT: leal (,%rax,4), %ecx
1022 ; AVX2-NEXT: vextractps $2, %xmm0, -24(%rsp,%rcx)
1023 ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
1024 ; AVX2-NEXT: subl %ecx, %eax
1025 ; AVX2-NEXT: andl $3, %eax
1026 ; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rax,4)
1027 ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
1030 ; AVX512F-LABEL: test_compress_illegal_element_type:
1032 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1033 ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
1034 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
1035 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
1036 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1037 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
1038 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1039 ; AVX512F-NEXT: vzeroupper
1040 ; AVX512F-NEXT: retq
1042 ; AVX512VL-LABEL: test_compress_illegal_element_type:
1043 ; AVX512VL: # %bb.0:
1044 ; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
1045 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
1046 ; AVX512VL-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z}
1047 ; AVX512VL-NEXT: retq
1048 %out = call <4 x i4> @llvm.experimental.vector.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef)
; <3 x i32> compress (a non-power-of-two element count) with an undef
; passthru. The three mask bits arrive in %edi, %esi and %edx.
; Expected code for the avx2 configuration: the mask bits are assembled into
; a vector, then the elements are stored one by one to a stack slot at
; mask-derived indices and reloaded as one vector.
; Expected code for both AVX-512 configurations: the mask bits are assembled
; into a k-register one bit at a time (the $-5 constant clears bit 2 before
; the third bit is merged in, the $7 constant keeps only the low three bits),
; followed by a zeroing vpcompressd (512-bit for plain avx512f, 128-bit for
; the avx512vbmi2 configuration).
1052 define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) {
1053 ; AVX2-LABEL: test_compress_narrow:
1055 ; AVX2-NEXT: vmovd %edi, %xmm1
1056 ; AVX2-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
1057 ; AVX2-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
1058 ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
1059 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
1060 ; AVX2-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
1061 ; AVX2-NEXT: vmovd %xmm1, %eax
1062 ; AVX2-NEXT: andl $1, %eax
1063 ; AVX2-NEXT: vextractps $1, %xmm0, -24(%rsp,%rax,4)
1064 ; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
1065 ; AVX2-NEXT: subl %ecx, %eax
1066 ; AVX2-NEXT: leal (,%rax,4), %ecx
1067 ; AVX2-NEXT: vextractps $2, %xmm0, -24(%rsp,%rcx)
1068 ; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
1069 ; AVX2-NEXT: subl %ecx, %eax
1070 ; AVX2-NEXT: andl $3, %eax
1071 ; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rax,4)
1072 ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
1075 ; AVX512F-LABEL: test_compress_narrow:
1077 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1078 ; AVX512F-NEXT: andl $1, %edi
1079 ; AVX512F-NEXT: kmovw %edi, %k0
1080 ; AVX512F-NEXT: kmovw %esi, %k1
1081 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1
1082 ; AVX512F-NEXT: kshiftrw $14, %k1, %k1
1083 ; AVX512F-NEXT: korw %k1, %k0, %k0
1084 ; AVX512F-NEXT: movw $-5, %ax
1085 ; AVX512F-NEXT: kmovw %eax, %k1
1086 ; AVX512F-NEXT: kandw %k1, %k0, %k0
1087 ; AVX512F-NEXT: kmovw %edx, %k1
1088 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1
1089 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1
1090 ; AVX512F-NEXT: korw %k1, %k0, %k0
1091 ; AVX512F-NEXT: movb $7, %al
1092 ; AVX512F-NEXT: kmovw %eax, %k1
1093 ; AVX512F-NEXT: kandw %k1, %k0, %k0
1094 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
1095 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1096 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
1097 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1098 ; AVX512F-NEXT: vzeroupper
1099 ; AVX512F-NEXT: retq
1101 ; AVX512VL-LABEL: test_compress_narrow:
1102 ; AVX512VL: # %bb.0:
1103 ; AVX512VL-NEXT: andl $1, %edi
1104 ; AVX512VL-NEXT: kmovw %edi, %k0
1105 ; AVX512VL-NEXT: kmovd %esi, %k1
1106 ; AVX512VL-NEXT: kshiftlw $15, %k1, %k1
1107 ; AVX512VL-NEXT: kshiftrw $14, %k1, %k1
1108 ; AVX512VL-NEXT: korw %k1, %k0, %k0
1109 ; AVX512VL-NEXT: movw $-5, %ax
1110 ; AVX512VL-NEXT: kmovd %eax, %k1
1111 ; AVX512VL-NEXT: kandw %k1, %k0, %k0
1112 ; AVX512VL-NEXT: kmovd %edx, %k1
1113 ; AVX512VL-NEXT: kshiftlw $15, %k1, %k1
1114 ; AVX512VL-NEXT: kshiftrw $13, %k1, %k1
1115 ; AVX512VL-NEXT: korw %k1, %k0, %k0
1116 ; AVX512VL-NEXT: movb $7, %al
1117 ; AVX512VL-NEXT: kmovd %eax, %k1
1118 ; AVX512VL-NEXT: kandw %k1, %k0, %k1
1119 ; AVX512VL-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z}
1120 ; AVX512VL-NEXT: retq
1121 %out = call <3 x i32> @llvm.experimental.vector.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef)
; <3 x i3> compress: a non-power-of-two element count combined with an element
; type that has no native register form, with an undef passthru. The three
; data elements arrive in %edi, %esi and %edx, the three mask bits in %ecx,
; %r8d and %r9d, and the three result elements are returned in %al, %dl, %cl.
; Expected code for the avx2 configuration: the data registers are stored to
; a stack slot at mask-derived indices, the slot is reloaded as a vector and
; the three results are extracted from it.
; Expected code for both AVX-512 configurations: the mask is assembled into a
; k-register one bit at a time, the data is assembled into %xmm0 with
; vmovd/vpinsrd, compressed with a zeroing vpcompressd (512-bit for plain
; avx512f, 128-bit for the avx512vbmi2 configuration), and the three results
; are extracted.
1125 define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) {
1126 ; AVX2-LABEL: test_compress_narrow_illegal_element_type:
1128 ; AVX2-NEXT: vmovd %ecx, %xmm0
1129 ; AVX2-NEXT: vpinsrd $1, %r8d, %xmm0, %xmm0
1130 ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
1131 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
1132 ; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
1133 ; AVX2-NEXT: vmovd %xmm0, %eax
1134 ; AVX2-NEXT: andl $1, %eax
1135 ; AVX2-NEXT: movl %esi, -24(%rsp,%rax,4)
1136 ; AVX2-NEXT: vpextrd $1, %xmm0, %ecx
1137 ; AVX2-NEXT: subl %ecx, %eax
1138 ; AVX2-NEXT: shll $2, %eax
1139 ; AVX2-NEXT: movl %edx, -24(%rsp,%rax)
1140 ; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
1141 ; AVX2-NEXT: vmovd %xmm0, %eax
1142 ; AVX2-NEXT: vpextrb $4, %xmm0, %edx
1143 ; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
1144 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1145 ; AVX2-NEXT: # kill: def $dl killed $dl killed $edx
1146 ; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
1149 ; AVX512F-LABEL: test_compress_narrow_illegal_element_type:
1151 ; AVX512F-NEXT: andl $1, %ecx
1152 ; AVX512F-NEXT: kmovw %ecx, %k0
1153 ; AVX512F-NEXT: kmovw %r8d, %k1
1154 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1
1155 ; AVX512F-NEXT: kshiftrw $14, %k1, %k1
1156 ; AVX512F-NEXT: korw %k1, %k0, %k0
1157 ; AVX512F-NEXT: movw $-5, %ax
1158 ; AVX512F-NEXT: kmovw %eax, %k1
1159 ; AVX512F-NEXT: kandw %k1, %k0, %k0
1160 ; AVX512F-NEXT: kmovw %r9d, %k1
1161 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1
1162 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1
1163 ; AVX512F-NEXT: korw %k1, %k0, %k0
1164 ; AVX512F-NEXT: movb $7, %al
1165 ; AVX512F-NEXT: kmovw %eax, %k1
1166 ; AVX512F-NEXT: kandw %k1, %k0, %k0
1167 ; AVX512F-NEXT: vmovd %edi, %xmm0
1168 ; AVX512F-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
1169 ; AVX512F-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
1170 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
1171 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1172 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
1173 ; AVX512F-NEXT: vmovd %xmm0, %eax
1174 ; AVX512F-NEXT: vpextrb $4, %xmm0, %edx
1175 ; AVX512F-NEXT: vpextrb $8, %xmm0, %ecx
1176 ; AVX512F-NEXT: # kill: def $al killed $al killed $eax
1177 ; AVX512F-NEXT: # kill: def $dl killed $dl killed $edx
1178 ; AVX512F-NEXT: # kill: def $cl killed $cl killed $ecx
1179 ; AVX512F-NEXT: vzeroupper
1180 ; AVX512F-NEXT: retq
1182 ; AVX512VL-LABEL: test_compress_narrow_illegal_element_type:
1183 ; AVX512VL: # %bb.0:
1184 ; AVX512VL-NEXT: andl $1, %ecx
1185 ; AVX512VL-NEXT: kmovw %ecx, %k0
1186 ; AVX512VL-NEXT: kmovd %r8d, %k1
1187 ; AVX512VL-NEXT: kshiftlw $15, %k1, %k1
1188 ; AVX512VL-NEXT: kshiftrw $14, %k1, %k1
1189 ; AVX512VL-NEXT: korw %k1, %k0, %k0
1190 ; AVX512VL-NEXT: movw $-5, %ax
1191 ; AVX512VL-NEXT: kmovd %eax, %k1
1192 ; AVX512VL-NEXT: kandw %k1, %k0, %k0
1193 ; AVX512VL-NEXT: kmovd %r9d, %k1
1194 ; AVX512VL-NEXT: kshiftlw $15, %k1, %k1
1195 ; AVX512VL-NEXT: kshiftrw $13, %k1, %k1
1196 ; AVX512VL-NEXT: korw %k1, %k0, %k0
1197 ; AVX512VL-NEXT: movb $7, %al
1198 ; AVX512VL-NEXT: kmovd %eax, %k1
1199 ; AVX512VL-NEXT: kandw %k1, %k0, %k1
1200 ; AVX512VL-NEXT: vmovd %edi, %xmm0
1201 ; AVX512VL-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
1202 ; AVX512VL-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
1203 ; AVX512VL-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z}
1204 ; AVX512VL-NEXT: vmovd %xmm0, %eax
1205 ; AVX512VL-NEXT: vpextrb $4, %xmm0, %edx
1206 ; AVX512VL-NEXT: vpextrb $8, %xmm0, %ecx
1207 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
1208 ; AVX512VL-NEXT: # kill: def $dl killed $dl killed $edx
1209 ; AVX512VL-NEXT: # kill: def $cl killed $cl killed $ecx
1210 ; AVX512VL-NEXT: retq
1211 %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
; <4 x i32> compress with a zeroinitializer passthru.
; Expected code for the avx2 configuration: the stack slot is first filled
; with zeros (vxorps + vmovaps), the elements are stored at indices formed by
; accumulating the mask bits, the final store uses an index clamped to 3 and a
; value chosen by cmovbel between zero and the last element, and the slot is
; reloaded as the result.
; Expected code for both AVX-512 configurations: the zero passthru maps onto
; the zeroing form of vpcompressd, so no separate blend is asserted (512-bit
; with the mask limited to 4 bits for plain avx512f, 128-bit for the
; avx512vbmi2 configuration).
1215 define <4 x i32> @test_compress_v4i32_zero_passthru(<4 x i32> %vec, <4 x i1> %mask) {
1216 ; AVX2-LABEL: test_compress_v4i32_zero_passthru:
1218 ; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
1219 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
1220 ; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
1221 ; AVX2-NEXT: vmovaps %xmm2, -{{[0-9]+}}(%rsp)
1222 ; AVX2-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
1223 ; AVX2-NEXT: vmovd %xmm1, %eax
1224 ; AVX2-NEXT: andl $1, %eax
1225 ; AVX2-NEXT: vextractps $1, %xmm0, -24(%rsp,%rax,4)
1226 ; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
1227 ; AVX2-NEXT: andl $1, %ecx
1228 ; AVX2-NEXT: addq %rax, %rcx
1229 ; AVX2-NEXT: vextractps $2, %xmm0, -24(%rsp,%rcx,4)
1230 ; AVX2-NEXT: vpextrd $2, %xmm1, %eax
1231 ; AVX2-NEXT: andl $1, %eax
1232 ; AVX2-NEXT: addq %rcx, %rax
1233 ; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
1234 ; AVX2-NEXT: andl $1, %ecx
1235 ; AVX2-NEXT: addq %rax, %rcx
1236 ; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
1237 ; AVX2-NEXT: andl $3, %eax
1238 ; AVX2-NEXT: vextractps $3, %xmm0, -24(%rsp,%rax,4)
1239 ; AVX2-NEXT: xorl %eax, %eax
1240 ; AVX2-NEXT: cmpq $3, %rcx
1241 ; AVX2-NEXT: movl $3, %edx
1242 ; AVX2-NEXT: cmovbq %rcx, %rdx
1243 ; AVX2-NEXT: vextractps $3, %xmm0, %ecx
1244 ; AVX2-NEXT: cmovbel %eax, %ecx
1245 ; AVX2-NEXT: movl %ecx, -24(%rsp,%rdx,4)
1246 ; AVX2-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
1249 ; AVX512F-LABEL: test_compress_v4i32_zero_passthru:
1251 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1252 ; AVX512F-NEXT: vpslld $31, %xmm1, %xmm1
1253 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
1254 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
1255 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
1256 ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1} {z}
1257 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1258 ; AVX512F-NEXT: vzeroupper
1259 ; AVX512F-NEXT: retq
1261 ; AVX512VL-LABEL: test_compress_v4i32_zero_passthru:
1262 ; AVX512VL: # %bb.0:
1263 ; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
1264 ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
1265 ; AVX512VL-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z}
1266 ; AVX512VL-NEXT: retq
1267 %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> zeroinitializer)