1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl < %s | FileCheck %s
3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
4 target triple = "x86_64-unknown-unknown"
6 define void @stack_fold_vp2intersectd(<16 x i32>* %a, <16 x i32> %b, <16 x i1>* nocapture %m0, <16 x i1>* nocapture %m1) {
7 ; CHECK-LABEL: stack_fold_vp2intersectd:
8 ; CHECK: vp2intersectd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
9 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
10 %2 = load <16 x i32>, <16 x i32>* %a
11 %3 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %2, <16 x i32> %b)
12 %4 = extractvalue { <16 x i1>, <16 x i1> } %3, 0
13 store <16 x i1> %4, <16 x i1>* %m0
14 %5 = extractvalue { <16 x i1>, <16 x i1> } %3, 1
15 store <16 x i1> %5, <16 x i1>* %m1
18 declare { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32>, <16 x i32>)
20 define void @stack_fold_vp2intersectq(<8 x i64>* %a, <8 x i64> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
21 ; CHECK-LABEL: stack_fold_vp2intersectq:
22 ; CHECK: vp2intersectq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
23 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
24 %2 = load <8 x i64>, <8 x i64>* %a
25 %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %2, <8 x i64> %b)
26 %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
27 store <8 x i1> %4, <8 x i1>* %m0
28 %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
29 store <8 x i1> %5, <8 x i1>* %m1
32 declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64>, <8 x i64>)
34 define void @stack_fold_vp2intersectd_256(<8 x i32>* %a, <8 x i32> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
35 ; CHECK-LABEL: stack_fold_vp2intersectd_256:
36 ; CHECK: vp2intersectd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 32-byte Folded Reload
37 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
38 %2 = load <8 x i32>, <8 x i32>* %a
39 %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %2, <8 x i32> %b)
40 %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
41 store <8 x i1> %4, <8 x i1>* %m0
42 %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
43 store <8 x i1> %5, <8 x i1>* %m1
46 declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)
48 define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
49 ; CHECK-LABEL: stack_fold_vp2intersectq_256:
50 ; CHECK: vp2intersectq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 32-byte Folded Reload
51 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
52 %2 = load <4 x i64>, <4 x i64>* %a
53 %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %2, <4 x i64> %b)
54 %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
55 store <4 x i1> %4, <4 x i1>* %m0
56 %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
57 store <4 x i1> %5, <4 x i1>* %m1
60 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)
62 define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
63 ; CHECK-LABEL: stack_fold_vp2intersectd_128:
64 ; CHECK: vp2intersectd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
65 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
66 %2 = load <4 x i32>, <4 x i32>* %a
67 %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %2, <4 x i32> %b)
68 %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
69 store <4 x i1> %4, <4 x i1>* %m0
70 %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
71 store <4 x i1> %5, <4 x i1>* %m1
74 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)
76 define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* nocapture %m0, <2 x i1>* nocapture %m1) {
77 ; CHECK-LABEL: stack_fold_vp2intersectq_128:
78 ; CHECK: vp2intersectq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
79 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
80 %2 = load <2 x i64>, <2 x i64>* %a
81 %3 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %2, <2 x i64> %b)
82 %4 = extractvalue { <2 x i1>, <2 x i1> } %3, 0
83 store <2 x i1> %4, <2 x i1>* %m0
84 %5 = extractvalue { <2 x i1>, <2 x i1> } %3, 1
85 store <2 x i1> %5, <2 x i1>* %m1
88 declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)