[ci] Fix clang-santisers job for GHA change
[xapian.git] / xapian-core / matcher / collapser.h
blob544fd99017966c099643e6338d63ebb1c49f791e
1 /** @file
2 * @brief Collapse documents with the same collapse key during the match.
3 */
4 /* Copyright (C) 2009,2011,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_COLLAPSER_H
22 #define XAPIAN_INCLUDED_COLLAPSER_H
24 #include "backends/documentinternal.h"
25 #include "msetcmp.h"
26 #include "omassert.h"
27 #include "api/result.h"
29 #include <unordered_map>
30 #include <vector>
32 /// Enumeration reporting how a result will be handled by the Collapser.
33 typedef enum {
34 EMPTY,
35 NEW,
36 ADD,
37 REJECT,
38 REPLACE
39 } collapse_result;
41 /// Class tracking information for a given value of the collapse key.
42 class CollapseData {
43 /** Currently kept MSet entries for this value of the collapse key.
45 * If collapse_max > 1, then this is a min-heap once collapse_count > 0.
47 * The first member of the pair is the index into proto_mset.results
48 * and the second is the docid of the entry (used to detect if the
49 * entry in proto_mset.results we were referring to has been dropped).
51 * FIXME: We expect collapse_max to be small, so perhaps we should
52 * preallocate space for that many entries and/or allocate space in
53 * larger blocks to divvy up?
55 std::vector<std::pair<Xapian::doccount, Xapian::docid>> items;
57 /// The highest weight of a document we've rejected.
58 double next_best_weight = 0.0;
60 /// The number of documents we've rejected.
61 Xapian::doccount collapse_count = 0;
63 public:
64 /// Construct with the given item.
65 CollapseData(Xapian::doccount item, Xapian::docid did)
66 : items(1, { item, did })
67 { }
69 /** Check a new result with this collapse key value.
71 * If this method determines the action to take is ADD or REPLACE, then
72 * the proto-mset should be updated and then add_item() called to complete
73 * the update of the CollapseData (if the result doesn't actually get
74 * added, then it's OK not to follow up with a call to add_item()).
76 * @param results The results so far.
77 * @param result The new result.
78 * @param collapse_max Max no. of items for each collapse key value.
79 * @param mcmp Result comparison functor.
80 * @param[out] old_item Item to be replaced (when REPLACE is returned).
82 * @return How to handle @a result: ADD, REJECT or REPLACE.
84 collapse_result check_item(const std::vector<Result>& results,
85 const Result& result,
86 Xapian::doccount collapse_max,
87 MSetCmp mcmp,
88 Xapian::doccount& old_item);
90 /** Set item after constructing with a placeholder.
92 * @param item The new item (index into results).
94 void set_item(Xapian::doccount item);
96 /** Complete update of new result with this collapse key value.
98 * @param results The results so far.
99 * @param item The new item (index into results).
100 * @param collapse_max Max no. of items for each collapse key value.
101 * @param mcmp Result comparison functor.
103 void add_item(const std::vector<Result>& results,
104 Xapian::doccount item,
105 Xapian::doccount collapse_max,
106 MSetCmp mcmp);
108 /** Process relocation of entry in results.
110 * @param from The old item (index into results).
111 * @param to The new item (index into results).
113 void result_has_moved(Xapian::doccount from, Xapian::doccount to) {
114 // Yes, this *is* a linear search, but only through up to collapse_max
115 // items and we expect collapse_max to be very small (it's the maximum
116 // number of items to keep for each collapse key value).
117 for (auto&& item : items) {
118 if (item.first == from) {
119 item.first = to;
120 return;
123 // The entry ought to be present.
124 Assert(false);
127 /// The highest weight of a document we've rejected.
128 double get_next_best_weight() const { return next_best_weight; }
130 /// The number of documents we've rejected.
131 Xapian::doccount get_collapse_count() const { return collapse_count; }
134 /// The Collapser class tracks collapse keys and the documents they match.
135 class Collapser {
136 /// Map from collapse key values to the items we're keeping for them.
137 std::unordered_map<std::string, CollapseData> table;
139 /// How many items we're currently keeping in @a table.
140 Xapian::doccount entry_count = 0;
142 /** How many documents have we seen without a collapse key?
144 * We use this statistic to improve matches_lower_bound.
146 Xapian::doccount no_collapse_key = 0;
148 /** How many documents with duplicate collapse keys we have ignored.
150 * We use this statistic to improve matches_estimated (by considering
151 * the rate of collapsing) and matches_upper_bound.
153 Xapian::doccount dups_ignored = 0;
155 /** How many documents we've considered for collapsing.
157 * We use this statistic to improve matches_estimated (by considering
158 * the rate of collapsing).
160 Xapian::doccount docs_considered = 0;
162 /** The value slot we're getting collapse keys from. */
163 Xapian::valueno slot;
165 /** The maximum number of items to keep for each collapse key value. */
166 Xapian::doccount collapse_max;
168 std::vector<Result>& results;
170 MSetCmp mcmp;
172 /** Pointer to CollapseData when NEW or ADD is in progress. */
173 CollapseData* ptr = NULL;
175 /// Adapt @a mcmp to be usable with min_heap.
176 bool operator()(Xapian::doccount a, Xapian::doccount b) const {
177 return mcmp(results[a], results[b]);
180 public:
181 /// Replaced item when REPLACE is returned by @a collapse().
182 Xapian::doccount old_item = 0;
184 Collapser(Xapian::valueno slot_,
185 Xapian::doccount collapse_max_,
186 std::vector<Result>& results_,
187 MSetCmp mcmp_)
188 : slot(slot_),
189 collapse_max(collapse_max_),
190 results(results_),
191 mcmp(mcmp_) { }
193 /// Return true if collapsing is active for this match.
194 operator bool() const { return collapse_max != 0; }
196 /** Check a new result.
198 * If this method determines the action to take is NEW or ADD then the
199 * proto-mset should be updated and then process() called to complete the
200 * update (if the result doesn't actually get added, then it's OK not
201 * to follow up with a call to process()).
203 * @param result The new result.
204 * @param vsdoc Document for getting values.
206 * @return How to handle @a result: EMPTY, NEW, ADD, REJECT or REPLACE.
208 collapse_result check(Result& result,
209 Xapian::Document::Internal & vsdoc);
211 /** Handle a new Result.
213 * @param action The collapse_result returned by check().
214 * @param item The new item (index into results).
216 void process(collapse_result action, Xapian::doccount item);
218 /** Process relocation of entry in results.
220 * @param from The old item (index into results).
221 * @param to The new item (index into results).
223 void result_has_moved(Xapian::doccount from, Xapian::doccount to) {
224 const std::string& collapse_key = results[to].get_collapse_key();
225 if (collapse_key.empty()) {
226 return;
228 auto it = table.find(collapse_key);
229 if (rare(it == table.end())) {
230 // The entry ought to be present.
231 Assert(false);
232 return;
235 CollapseData& collapse_data = it->second;
236 collapse_data.result_has_moved(from, to);
239 Xapian::doccount get_collapse_count(const std::string & collapse_key,
240 int percent_threshold,
241 double min_weight) const;
243 Xapian::doccount get_docs_considered() const { return docs_considered; }
245 Xapian::doccount get_dups_ignored() const { return dups_ignored; }
247 Xapian::doccount get_entries() const { return entry_count; }
249 Xapian::doccount get_matches_lower_bound() const;
251 void finalise(double min_weight, int percent_threshold);
254 /** Simpler version of Collapser used when merging MSet objects.
256 * We merge results in descending rank order, so collapsing is much simpler
257 * than during the match - we just need to discard documents if we've already
258 * seen collapse_max with the same key.
260 class CollapserLite {
261 /// Map from collapse key values to collapse counts.
262 std::unordered_map<std::string, Xapian::doccount> table;
264 /// How many items we're currently keeping in @a table.
265 Xapian::doccount entry_count = 0;
267 /** How many documents have we seen without a collapse key?
269 * We use this statistic to improve matches_lower_bound.
271 Xapian::doccount no_collapse_key = 0;
273 /** How many documents with duplicate collapse keys we have ignored.
275 * We use this statistic to improve matches_estimated (by considering
276 * the rate of collapsing) and matches_upper_bound.
278 Xapian::doccount dups_ignored = 0;
280 /** How many documents we've considered for collapsing.
282 * We use this statistic to improve matches_estimated (by considering
283 * the rate of collapsing).
285 Xapian::doccount docs_considered = 0;
287 /** The maximum number of items to keep for each collapse key value. */
288 Xapian::doccount collapse_max;
290 public:
291 CollapserLite(Xapian::doccount collapse_max_)
292 : collapse_max(collapse_max_) {}
294 /// Return true if collapsing is active for this match.
295 operator bool() const { return collapse_max != 0; }
297 /** Try to add a new key.
299 * @return true if accepted; false if rejected.
301 bool add(const std::string& key) {
302 ++docs_considered;
304 if (key.empty()) {
305 ++no_collapse_key;
306 return true;
309 auto r = table.emplace(key, 1);
310 if (r.second) {
311 // New entry, set to 1.
312 } else if (r.first->second == collapse_max) {
313 // Already seen collapse_max with this key so reject.
314 ++dups_ignored;
315 return false;
316 } else {
317 // Increment count.
318 ++r.first->second;
320 ++entry_count;
321 return true;
324 Xapian::doccount get_docs_considered() const { return docs_considered; }
326 Xapian::doccount get_dups_ignored() const { return dups_ignored; }
328 Xapian::doccount get_entries() const { return entry_count; }
330 Xapian::doccount get_matches_lower_bound() const {
331 return no_collapse_key + entry_count;
334 void finalise(std::vector<Result>& results, int percent_threshold) {
335 if (table.empty() || results.empty())
336 return;
338 // We need to fill in collapse_count values in results using the
339 // information stored in table.
340 Xapian::doccount todo = entry_count;
341 for (Result& result : results) {
342 const std::string& key = result.get_collapse_key();
343 if (key.empty())
344 continue;
346 // Adjust collapse_count.
347 if (percent_threshold) {
348 // FIXME: We can probably do better here.
349 result.set_collapse_count(1);
350 } else {
351 auto c = result.get_collapse_count() + table[key];
352 result.set_collapse_count(c);
355 if (--todo == 0) {
356 // Terminate early if we've handled all non-empty entries.
357 break;
363 #endif // XAPIAN_INCLUDED_COLLAPSER_H