2 * @brief Xapian::ESet::Internal class
4 /* Copyright (C) 2008,2010,2011,2013,2016 Olly Betts
5 * Copyright (C) 2011 Action Without Borders
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "esetinternal.h"
26 #include "xapian/enquire.h"
27 #include "xapian/expanddecider.h"
28 #include "backends/database.h"
30 #include "api/omenquireinternal.h"
31 #include "expandweight.h"
33 #include "ortermlist.h"
35 #include "api/termlist.h"
36 #include "unicode/description_append.h"
48 Internal::ExpandTerm::get_description() const
50 string
desc("ExpandTerm(");
53 description_append(desc
, term
);
58 template<class CLASS
> struct delete_ptr
{
59 void operator()(CLASS
*p
) const { delete p
; }
62 struct CompareTermListSizeAscending
{
63 bool operator()(const TermList
*a
, const TermList
*b
) const {
64 return a
->get_approx_size() > b
->get_approx_size();
68 /** Build a tree of binary TermList objects like QueryOptimiser does for
72 build_termlist_tree(const Xapian::Database
&db
, const RSet
& rset
)
74 Assert(!rset
.empty());
76 const set
<Xapian::docid
> & docids
= rset
.internal
->get_items();
78 vector
<TermList
*> termlists
;
79 termlists
.reserve(docids
.size());
82 const size_t multiplier
= db
.internal
.size();
83 set
<Xapian::docid
>::const_iterator i
;
84 for (i
= docids
.begin(); i
!= docids
.end(); ++i
) {
85 Xapian::docid realdid
= (*i
- 1) / multiplier
+ 1;
86 Xapian::doccount dbnumber
= (*i
- 1) % multiplier
;
88 // Push NULL first to avoid leaking the new TermList if push_back()
90 termlists
.push_back(0);
91 termlists
.back() = db
.internal
[dbnumber
]->open_term_list(realdid
);
92 termlists
.back()->shard_index
= dbnumber
;
95 Assert(!termlists
.empty());
96 if (termlists
.size() == 1) return termlists
[0];
98 // Make termlists into a heap so that the longest termlist is at the
100 make_heap(termlists
.begin(), termlists
.end(),
101 CompareTermListSizeAscending());
103 // Now build a tree of binary TermList objects. The algorithm used to
104 // build the tree is like that used to build an optimal Huffman coding
105 // tree. If we called next() repeatedly, this arrangement would
106 // minimise the number of method calls. Generally we don't actually do
107 // that, but this arrangement is still likely to be a good one, and it
108 // does minimise the work in the worst case.
110 AssertRel(termlists
.size(), >=, 2);
111 // We build the tree such that at each branch:
113 // l.get_approx_size() >= r.get_approx_size()
115 // We do this so that the OrTermList class can be optimised
116 // assuming that this is the case.
117 TermList
* r
= termlists
.front();
118 pop_heap(termlists
.begin(), termlists
.end(),
119 CompareTermListSizeAscending());
120 termlists
.pop_back();
121 TermList
* l
= termlists
.front();
123 TermList
* pl
= new OrTermList(l
, r
);
125 if (termlists
.size() == 1) return pl
;
127 pop_heap(termlists
.begin(), termlists
.end(),
128 CompareTermListSizeAscending());
129 termlists
.back() = pl
;
130 push_heap(termlists
.begin(), termlists
.end(),
131 CompareTermListSizeAscending());
134 for_each(termlists
.begin(), termlists
.end(), delete_ptr
<TermList
>());
140 ESet::Internal::expand(Xapian::termcount max_esize
,
141 const Xapian::Database
& db
,
143 const Xapian::ExpandDecider
* edecider
,
144 Xapian::Internal::ExpandWeight
& eweight
,
147 LOGCALL_VOID(EXPAND
, "ESet::Internal::expand", max_esize
| db
| rset
| edecider
| eweight
);
148 // These two cases are handled by our caller.
150 Assert(!rset
.empty());
151 // This method should only be called once for a given ESet::Internal, so
152 // check we're empty.
154 Assert(items
.empty());
156 AutoPtr
<TermList
> tree(build_termlist_tree(db
, rset
));
159 bool is_heap
= false;
161 // See if the root needs replacing.
162 TermList
* new_root
= tree
->next();
164 LOGLINE(EXPAND
, "Replacing the root of the termlist tree");
165 tree
.reset(new_root
);
168 if (tree
->at_end()) break;
170 string term
= tree
->get_termname();
172 // If there's an ExpandDecider, see if it accepts the term.
173 if (edecider
&& !(*edecider
)(term
)) continue;
177 /* Set up the ExpandWeight by clearing the existing statistics and
178 collecting statistics for the new term. */
179 eweight
.collect_stats(tree
.get(), term
);
181 double wt
= eweight
.get_weight();
183 // If the weights are equal, we prefer the lexically smaller term and
184 // so we use "<=" not "<" here.
185 if (wt
<= min_wt
) continue;
187 items
.push_back(Xapian::Internal::ExpandTerm(wt
, term
));
189 // The candidate ESet is overflowing, so remove the worst element in it
191 if (items
.size() > max_esize
) {
192 if (rare(!is_heap
)) {
194 make_heap(items
.begin(), items
.end());
196 push_heap(items
.begin(), items
.end());
198 pop_heap(items
.begin(), items
.end());
200 min_wt
= items
.front().wt
;
204 // Now sort the contents of the new ESet.
206 sort_heap(items
.begin(), items
.end());
208 sort(items
.begin(), items
.end());
213 ESet::Internal::get_description() const
215 string
desc("ESet::Internal(ebound=");
218 vector
<Xapian::Internal::ExpandTerm
>::const_iterator i
;
219 for (i
= items
.begin(); i
!= items
.end(); ++i
) {
221 desc
+= i
->get_description();
230 ESet::ESet(const ESet
& o
) : internal(o
.internal
) { }
233 ESet::operator=(const ESet
& o
)
235 internal
= o
.internal
;
239 ESet::ESet(ESet
&&) = default;
242 ESet::operator=(ESet
&&) = default;
249 if (!internal
.get()) return 0;
250 return Xapian::termcount(internal
->items
.size());
254 ESet::get_ebound() const
256 if (!internal
.get()) return 0;
257 return internal
->ebound
;
261 ESet::get_description() const
263 string desc
= "ESet(";
265 desc
+= internal
->get_description();
272 ESetIterator::operator*() const
274 return (eset
.internal
->items
.end() - off_from_end
)->term
;
278 ESetIterator::get_weight() const
280 return (eset
.internal
->items
.end() - off_from_end
)->wt
;
284 ESetIterator::get_description() const
286 string desc
= "ESetIterator(";
287 if (eset
.internal
.get())
288 desc
+= str(eset
.internal
->items
.size() - off_from_end
);