1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl < %s | FileCheck %s
; Module-level target description: the data layout string and an x86-64
; triple, the same triple that is passed via -mtriple on the llc command line
; at the top of this file.
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
; Stack-folding test for the 512-bit dword form of vp2intersect.
;   %a      - pointer to a <16 x i32>; loaded and used as the first operand.
;   %b      - <16 x i32> passed by value; used as the second operand.
;   %m0/%m1 - destinations for the two <16 x i1> results.
; The CHECK lines expect %zmm0 (presumably holding %b on entry) to be spilled
; to a stack slot, and vp2intersectd to consume that slot directly as a memory
; operand (the "Folded Reload") with %k0 as the named destination. The two
; results are then expected to be stored with kmovw from %k0 to (%rsi) and
; from %k1 to (%rdx).
7 define void @stack_fold_vp2intersectd(<16 x i32>* %a, <16 x i32> %b, <16 x i1>* nocapture %m0, <16 x i1>* nocapture %m1) {
8 ; CHECK-LABEL: stack_fold_vp2intersectd:
10 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
15 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
16 ; CHECK-NEXT: kmovw %k0, (%rsi)
17 ; CHECK-NEXT: kmovw %k1, (%rdx)
18 ; CHECK-NEXT: vzeroupper
; Side-effecting inline asm "nop" with one "=x" output and clobbers of
; xmm1-xmm31 and flags. Presumably this leaves no vector register free to keep
; %b live across it, which forces the spill the assertions above expect.
20 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Load the first operand from memory, call the intrinsic with (%2, %b), then
; store element 0 of the returned pair to %m0 and element 1 to %m1.
21 %2 = load <16 x i32>, <16 x i32>* %a
22 %3 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %2, <16 x i32> %b)
23 %4 = extractvalue { <16 x i1>, <16 x i1> } %3, 0
24 store <16 x i1> %4, <16 x i1>* %m0
25 %5 = extractvalue { <16 x i1>, <16 x i1> } %3, 1
26 store <16 x i1> %5, <16 x i1>* %m1
; Intrinsic under test (512-bit, dword elements): takes two <16 x i32>
; operands and returns a pair of <16 x i1> masks.
29 declare { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32>, <16 x i32>)
; Stack-folding test for the 512-bit qword form of vp2intersect.
;   %a      - pointer to an <8 x i64>; loaded and used as the first operand.
;   %b      - <8 x i64> passed by value; used as the second operand.
;   %m0/%m1 - destinations for the two <8 x i1> results.
; The CHECK lines expect %zmm0 (presumably holding %b on entry) to be spilled
; to a stack slot, and vp2intersectq to consume that slot directly as a memory
; operand (the "Folded Reload") with %k0 as the named destination. The masks
; are then expected to go through general-purpose registers (%k1 to %eax,
; %k0 to %ecx) and be written as single bytes: %cl to (%rsi), %al to (%rdx).
31 define void @stack_fold_vp2intersectq(<8 x i64>* %a, <8 x i64> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
32 ; CHECK-LABEL: stack_fold_vp2intersectq:
34 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
38 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
39 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
40 ; CHECK-NEXT: kmovw %k1, %eax
41 ; CHECK-NEXT: kmovw %k0, %ecx
42 ; CHECK-NEXT: movb %cl, (%rsi)
43 ; CHECK-NEXT: movb %al, (%rdx)
44 ; CHECK-NEXT: vzeroupper
; Side-effecting inline asm "nop" with one "=x" output and clobbers of
; xmm1-xmm31 and flags. Presumably this leaves no vector register free to keep
; %b live across it, which forces the spill the assertions above expect.
46 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Load the first operand from memory, call the intrinsic with (%2, %b), then
; store element 0 of the returned pair to %m0 and element 1 to %m1.
47 %2 = load <8 x i64>, <8 x i64>* %a
48 %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %2, <8 x i64> %b)
49 %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
50 store <8 x i1> %4, <8 x i1>* %m0
51 %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
52 store <8 x i1> %5, <8 x i1>* %m1
; Intrinsic under test (512-bit, qword elements): takes two <8 x i64>
; operands and returns a pair of <8 x i1> masks.
55 declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64>, <8 x i64>)
; Stack-folding test for the 256-bit dword form of vp2intersect.
;   %a      - pointer to an <8 x i32>; loaded and used as the first operand.
;   %b      - <8 x i32> passed by value; used as the second operand.
;   %m0/%m1 - destinations for the two <8 x i1> results.
; The CHECK lines expect %ymm0 (presumably holding %b on entry) to be spilled
; to a stack slot, and vp2intersectd to consume that slot directly as a memory
; operand (the "Folded Reload") with %k0 as the named destination. The masks
; are then expected to go through general-purpose registers (%k1 to %eax,
; %k0 to %ecx) and be written as single bytes: %cl to (%rsi), %al to (%rdx).
57 define void @stack_fold_vp2intersectd_256(<8 x i32>* %a, <8 x i32> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
58 ; CHECK-LABEL: stack_fold_vp2intersectd_256:
60 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
64 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
65 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
66 ; CHECK-NEXT: kmovw %k1, %eax
67 ; CHECK-NEXT: kmovw %k0, %ecx
68 ; CHECK-NEXT: movb %cl, (%rsi)
69 ; CHECK-NEXT: movb %al, (%rdx)
70 ; CHECK-NEXT: vzeroupper
; Side-effecting inline asm "nop" with one "=x" output and clobbers of
; xmm1-xmm31 and flags. Presumably this leaves no vector register free to keep
; %b live across it, which forces the spill the assertions above expect.
72 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Load the first operand from memory, call the intrinsic with (%2, %b), then
; store element 0 of the returned pair to %m0 and element 1 to %m1.
73 %2 = load <8 x i32>, <8 x i32>* %a
74 %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %2, <8 x i32> %b)
75 %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
76 store <8 x i1> %4, <8 x i1>* %m0
77 %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
78 store <8 x i1> %5, <8 x i1>* %m1
; Intrinsic under test (256-bit, dword elements): takes two <8 x i32>
; operands and returns a pair of <8 x i1> masks.
81 declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)
; Stack-folding test for the 256-bit qword form of vp2intersect.
;   %a      - pointer to a <4 x i64>; loaded and used as the first operand.
;   %b      - <4 x i64> passed by value; used as the second operand.
;   %m0/%m1 - destinations for the two <4 x i1> results.
; The CHECK lines expect %ymm0 (presumably holding %b on entry) to be spilled
; to a stack slot, and vp2intersectq to consume that slot directly as a memory
; operand (the "Folded Reload") with %k0 as the named destination. The masks
; are then expected to go through general-purpose registers (%k1 to %eax,
; %k0 to %ecx) and be written as single bytes: %cl to (%rsi), %al to (%rdx).
83 define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
84 ; CHECK-LABEL: stack_fold_vp2intersectq_256:
86 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
90 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
91 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
92 ; CHECK-NEXT: kmovw %k1, %eax
93 ; CHECK-NEXT: kmovw %k0, %ecx
94 ; CHECK-NEXT: movb %cl, (%rsi)
95 ; CHECK-NEXT: movb %al, (%rdx)
96 ; CHECK-NEXT: vzeroupper
; Side-effecting inline asm "nop" with one "=x" output and clobbers of
; xmm1-xmm31 and flags. Presumably this leaves no vector register free to keep
; %b live across it, which forces the spill the assertions above expect.
98 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Load the first operand from memory, call the intrinsic with (%2, %b), then
; store element 0 of the returned pair to %m0 and element 1 to %m1.
99 %2 = load <4 x i64>, <4 x i64>* %a
100 %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %2, <4 x i64> %b)
101 %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
102 store <4 x i1> %4, <4 x i1>* %m0
103 %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
104 store <4 x i1> %5, <4 x i1>* %m1
; Intrinsic under test (256-bit, qword elements): takes two <4 x i64>
; operands and returns a pair of <4 x i1> masks.
107 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)
; Stack-folding test for the 128-bit dword form of vp2intersect.
;   %a      - pointer to a <4 x i32>; loaded and used as the first operand.
;   %b      - <4 x i32> passed by value; used as the second operand.
;   %m0/%m1 - destinations for the two <4 x i1> results.
; The CHECK lines expect %xmm0 (presumably holding %b on entry) to be spilled
; to a stack slot with vmovaps, and vp2intersectd to consume that slot
; directly as a memory operand (the "Folded Reload") with %k0 as the named
; destination. The masks are then expected to go through general-purpose
; registers (%k1 to %eax, %k0 to %ecx) and be written as single bytes: %cl to
; (%rsi), %al to (%rdx).
109 define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
110 ; CHECK-LABEL: stack_fold_vp2intersectd_128:
112 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
115 ; CHECK-NEXT: #NO_APP
116 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
117 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
118 ; CHECK-NEXT: kmovw %k1, %eax
119 ; CHECK-NEXT: kmovw %k0, %ecx
120 ; CHECK-NEXT: movb %cl, (%rsi)
121 ; CHECK-NEXT: movb %al, (%rdx)
; Side-effecting inline asm "nop" with one "=x" output and clobbers of
; xmm1-xmm31 and flags. Presumably this leaves no vector register free to keep
; %b live across it, which forces the spill the assertions above expect.
123 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Load the first operand from memory, call the intrinsic with (%2, %b), then
; store element 0 of the returned pair to %m0 and element 1 to %m1.
124 %2 = load <4 x i32>, <4 x i32>* %a
125 %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %2, <4 x i32> %b)
126 %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
127 store <4 x i1> %4, <4 x i1>* %m0
128 %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
129 store <4 x i1> %5, <4 x i1>* %m1
; Intrinsic under test (128-bit, dword elements): takes two <4 x i32>
; operands and returns a pair of <4 x i1> masks.
132 declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)
; Stack-folding test for the 128-bit qword form of vp2intersect.
;   %a      - pointer to a <2 x i64>; loaded and used as the first operand.
;   %b      - <2 x i64> passed by value; used as the second operand.
;   %m0/%m1 - destinations for the two <2 x i1> results.
; The CHECK lines expect %xmm0 (presumably holding %b on entry) to be spilled
; to a stack slot with vmovaps, and vp2intersectq to consume that slot
; directly as a memory operand (the "Folded Reload") with %k0 as the named
; destination. The masks are then expected to go through general-purpose
; registers (%k1 to %eax, %k0 to %ecx) and be written as single bytes: %cl to
; (%rsi), %al to (%rdx).
134 define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* nocapture %m0, <2 x i1>* nocapture %m1) {
135 ; CHECK-LABEL: stack_fold_vp2intersectq_128:
137 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
140 ; CHECK-NEXT: #NO_APP
141 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
142 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
143 ; CHECK-NEXT: kmovw %k1, %eax
144 ; CHECK-NEXT: kmovw %k0, %ecx
145 ; CHECK-NEXT: movb %cl, (%rsi)
146 ; CHECK-NEXT: movb %al, (%rdx)
; Side-effecting inline asm "nop" with one "=x" output and clobbers of
; xmm1-xmm31 and flags. Presumably this leaves no vector register free to keep
; %b live across it, which forces the spill the assertions above expect.
148 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; Load the first operand from memory, call the intrinsic with (%2, %b), then
; store element 0 of the returned pair to %m0 and element 1 to %m1.
149 %2 = load <2 x i64>, <2 x i64>* %a
150 %3 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %2, <2 x i64> %b)
151 %4 = extractvalue { <2 x i1>, <2 x i1> } %3, 0
152 store <2 x i1> %4, <2 x i1>* %m0
153 %5 = extractvalue { <2 x i1>, <2 x i1> } %3, 1
154 store <2 x i1> %5, <2 x i1>* %m1
; Intrinsic under test (128-bit, qword elements): takes two <2 x i64>
; operands and returns a pair of <2 x i1> masks.
157 declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)