; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
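
; ptest(X,Y) sets ZF := ((X & Y) == 0) and CF := ((~X & Y) == 0); the
; ptestz/ptestc/ptestnzc intrinsics return ZF, CF and (!ZF && !CF) respectively.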

;
; testz(~X,Y) -> testc(X,Y)
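; (ZF of ptest(~X,Y) = ((~X & Y) == 0) = CF of ptest(X,Y).)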
;

define i32 @ptestz_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_128_invert0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm1, %xmm0
; CHECK-NEXT:    cmovael %esi, %eax
; CHECK-NEXT:    retq
  %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %d)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

define i32 @ptestz_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_256_invert0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    cmovael %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t1, <4 x i64> %d)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

;
; testz(X,~Y) -> testc(Y,X)
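; (ZF of ptest(X,~Y) = ((X & ~Y) == 0) = CF of ptest(Y,X).)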
;

define i32 @ptestz_128_invert1(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_128_invert1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm0, %xmm1
; CHECK-NEXT:    cmovael %esi, %eax
; CHECK-NEXT:    retq
  %t1 = xor <2 x i64> %d, <i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %t1)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

define i32 @ptestz_256_invert1(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_256_invert1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm0, %ymm1
; CHECK-NEXT:    cmovael %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = xor <4 x i64> %d, <i64 -1, i64 -1, i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %c, <4 x i64> %t1)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

;
; testc(~X,Y) -> testz(X,Y)
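; (CF of ptest(~X,Y) = ((X & Y) == 0) = ZF of ptest(X,Y).)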
;

define i32 @ptestc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestc_128_invert0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm1, %xmm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    retq
  %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %d)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

define i32 @ptestc_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestc_256_invert0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %t1, <4 x i64> %d)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

;
; testnzc(~X,Y) -> testnzc(X,Y)
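; (Inverting the first operand swaps ZF and CF, and !ZF && !CF is symmetric in the two flags.)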
;

define i32 @ptestnzc_128_invert0(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestnzc_128_invert0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm1, %xmm0
; CHECK-NEXT:    cmovbel %esi, %eax
; CHECK-NEXT:    retq
  %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %d)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

define i32 @ptestnzc_256_invert0(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestnzc_256_invert0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    cmovbel %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %t1, <4 x i64> %d)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

define i32 @ptestnzc_256_invert0_commute(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestnzc_256_invert0_commute:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    cmoval %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
  %t2 = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %t1, <4 x i64> %d)
  %t3 = icmp eq i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

;
; testz(AND(X,Y),AND(X,Y)) -> testz(X,Y)
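; (AND is idempotent: (X & Y) & (X & Y) == X & Y.)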
;

define i32 @ptestz_128_and(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_128_and:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm1, %xmm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    retq
  %t1 = and <2 x i64> %c, %d
  %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t1)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

define i32 @ptestz_256_and(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_256_and:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    cmovel %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = and <4 x i64> %c, %d
  %t2 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t1, <4 x i64> %t1)
  %t3 = icmp eq i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

;
; testz(AND(~X,Y),AND(~X,Y)) -> testc(X,Y)
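; (((~X & Y) & (~X & Y)) == 0 iff (~X & Y) == 0, i.e. CF of ptest(X,Y).)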
;

define i32 @ptestz_128_andc(<2 x i64> %c, <2 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_128_andc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm1, %xmm0
; CHECK-NEXT:    cmovael %esi, %eax
; CHECK-NEXT:    retq
  %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
  %t2 = and <2 x i64> %t1, %d
  %t3 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t2, <2 x i64> %t2)
  %t4 = icmp ne i32 %t3, 0
  %t5 = select i1 %t4, i32 %a, i32 %b
  ret i32 %t5
}

define i32 @ptestz_256_andc(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_256_andc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    cmovbl %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
  %t2 = and <4 x i64> %t1, %d
  %t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> %t2)
  %t4 = icmp eq i32 %t3, 0
  %t5 = select i1 %t4, i32 %a, i32 %b
  ret i32 %t5
}

;
; testz(-1,X) -> testz(X,X)
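; (-1 & X == X and X & X == X, so both tests reduce to X == 0.)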
;

define i32 @ptestz_128_allones0(<2 x i64> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_128_allones0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm0, %xmm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    retq
  %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> <i64 -1, i64 -1>, <2 x i64> %c)
  %t2 = icmp ne i32 %t1, 0
  %t3 = select i1 %t2, i32 %a, i32 %b
  ret i32 %t3
}

define i32 @ptestz_256_allones0(<4 x i64> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_256_allones0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm0, %ymm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> %c)
  %t2 = icmp ne i32 %t1, 0
  %t3 = select i1 %t2, i32 %a, i32 %b
  ret i32 %t3
}

;
; testz(X,-1) -> testz(X,X)
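; (Same reduction with the operands commuted: X & -1 == X.)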
;

define i32 @ptestz_128_allones1(<2 x i64> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_128_allones1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %xmm0, %xmm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    retq
  %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> <i64 -1, i64 -1>)
  %t2 = icmp ne i32 %t1, 0
  %t3 = select i1 %t2, i32 %a, i32 %b
  ret i32 %t3
}

define i32 @ptestz_256_allones1(<4 x i64> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_256_allones1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm0, %ymm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %c, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
  %t2 = icmp ne i32 %t1, 0
  %t3 = select i1 %t2, i32 %a, i32 %b
  ret i32 %t3
}

define zeroext i1 @PR38522(ptr %x, ptr %y) {
; CHECK-LABEL: PR38522:
; CHECK:       # %bb.0: # %start
; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
; CHECK-NEXT:    vptest %xmm0, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
start:
  %0 = load <16 x i8>, ptr %x, align 16
  %1 = load <16 x i8>, ptr %y, align 16
  %2 = icmp sle <16 x i8> %0, %1
  %3 = sext <16 x i1> %2 to <16 x i8>
  %4 = bitcast <16 x i8> %3 to <2 x i64>
  %5 = tail call i32 @llvm.x86.sse41.ptestc(<2 x i64> %4, <2 x i64> <i64 -1, i64 -1>)
  %6 = icmp eq i32 %5, 1
  ret i1 %6
}

;
; testz(ashr(X,bw-1),-1) -> movmsk(X)
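; (ashr by bw-1 splats each element's sign bit, so the result is zero iff no
; sign bit is set, which is exactly what movmsk reports.)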
;

define i32 @ptestz_v2i64_signbits(<2 x i64> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_v2i64_signbits:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vmovmskpd %xmm0, %ecx
; CHECK-NEXT:    testl %ecx, %ecx
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    retq
  %t1 = ashr <2 x i64> %c, <i64 63, i64 63>
  %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> <i64 -1, i64 -1>)
  %t3 = icmp ne i32 %t2, 0
  %t4 = select i1 %t3, i32 %a, i32 %b
  ret i32 %t4
}

define i32 @ptestz_v8i32_signbits(<8 x i32> %c, i32 %a, i32 %b) {
; AVX1-LABEL: ptestz_v8i32_signbits:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movl %edi, %eax
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vptest %ymm0, %ymm0
; AVX1-NEXT:    cmovnel %esi, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ptestz_v8i32_signbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl %edi, %eax
; AVX2-NEXT:    vmovmskps %ymm0, %ecx
; AVX2-NEXT:    testl %ecx, %ecx
; AVX2-NEXT:    cmovnel %esi, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %t1 = ashr <8 x i32> %c, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %t2 = bitcast <8 x i32> %t1 to <4 x i64>
  %t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
  %t4 = icmp ne i32 %t3, 0
  %t5 = select i1 %t4, i32 %a, i32 %b
  ret i32 %t5
}

define i32 @ptestz_v8i16_signbits(<8 x i16> %c, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_v8i16_signbits:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vpmovmskb %xmm0, %ecx
; CHECK-NEXT:    testl $43690, %ecx # imm = 0xAAAA
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    retq
  %t1 = ashr <8 x i16> %c, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %t2 = bitcast <8 x i16> %t1 to <2 x i64>
  %t3 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t2, <2 x i64> <i64 -1, i64 -1>)
  %t4 = icmp ne i32 %t3, 0
  %t5 = select i1 %t4, i32 %a, i32 %b
  ret i32 %t5
}

define i32 @ptestz_v32i8_signbits(<32 x i8> %c, i32 %a, i32 %b) {
; AVX1-LABEL: ptestz_v32i8_signbits:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movl %edi, %eax
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vptest %ymm0, %ymm0
; AVX1-NEXT:    cmovnel %esi, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ptestz_v32i8_signbits:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl %edi, %eax
; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
; AVX2-NEXT:    testl %ecx, %ecx
; AVX2-NEXT:    cmovnel %esi, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %t1 = ashr <32 x i8> %c, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %t2 = bitcast <32 x i8> %t1 to <4 x i64>
  %t3 = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %t2, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
  %t4 = icmp ne i32 %t3, 0
  %t5 = select i1 %t4, i32 %a, i32 %b
  ret i32 %t5
}

;
; testz(or(extract_lo(X),extract_hi(X)),or(extract_lo(Y),extract_hi(Y))) -> testz(X,Y)
;

define i32 @ptestz_v2i64_concat(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
; CHECK-LABEL: ptestz_v2i64_concat:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    cmovnel %esi, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t1 = shufflevector <4 x i64> %c, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %t2 = shufflevector <4 x i64> %c, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %t3 = shufflevector <4 x i64> %d, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %t4 = shufflevector <4 x i64> %d, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %t5 = or <2 x i64> %t1, %t2
  %t6 = or <2 x i64> %t4, %t3
  %t7 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t5, <2 x i64> %t6)
  %t8 = icmp ne i32 %t7, 0
  %t9 = select i1 %t8, i32 %a, i32 %b
  ret i32 %t9
}

; FIXME: Foldable to ptest(xor(%0,%1),xor(%0,%1))
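; (ptestc(M,-1) sets CF iff M is all-ones; the sext'd compare mask is all-ones
; iff every lane of %0 equals %1, i.e. iff xor(%0,%1) is all-zero.)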
define i1 @PR38788_0(<4 x i32> %0, <4 x i32> %1) {
; CHECK-LABEL: PR38788_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vptest %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    retq
  %3 = icmp eq <4 x i32> %0, %1
  %4 = sext <4 x i1> %3 to <4 x i32>
  %5 = bitcast <4 x i32> %4 to <2 x i64>
  %6 = tail call i32 @llvm.x86.sse41.ptestc(<2 x i64> %5, <2 x i64> <i64 -1, i64 -1>)
  %7 = icmp eq i32 %6, 1
  ret i1 %7
}

define i1 @PR38788_1(<16 x i16> %0, <16 x i16> %1) {
; AVX1-LABEL: PR38788_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT:    vptest %ymm1, %ymm0
; AVX1-NEXT:    setae %al
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR38788_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vptest %ymm1, %ymm0
; AVX2-NEXT:    setae %al
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %3 = icmp eq <16 x i16> %0, %1
  %4 = sext <16 x i1> %3 to <16 x i16>
  %5 = bitcast <16 x i16> %4 to <4 x i64>
  %6 = tail call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %5, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>)
  %7 = icmp eq i32 %6, 0
  ret i1 %7
}

declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone