2 * @brief Collapse documents with the same collapse key during the match.
4 /* Copyright (C) 2009,2011,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_COLLAPSER_H
22 #define XAPIAN_INCLUDED_COLLAPSER_H
24 #include "backends/documentinternal.h"
27 #include "api/result.h"
29 #include <unordered_map>
32 /// Enumeration reporting how a result will be handled by the Collapser.
41 /// Class tracking information for a given value of the collapse key.
43 /** Currently kept MSet entries for this value of the collapse key.
45 * If collapse_max > 1, then this is a min-heap once collapse_count > 0.
47 * The first member of the pair is the index into proto_mset.results
48 * and the second is the docid of the entry (used to detect if the
49 * entry in proto_mset.results we were referring to has been dropped).
51 * FIXME: We expect collapse_max to be small, so perhaps we should
52 * preallocate space for that many entries and/or allocate space in
53 * larger blocks to divvy up?
55 std::vector
<std::pair
<Xapian::doccount
, Xapian::docid
>> items
;
57 /// The highest weight of a document we've rejected.
58 double next_best_weight
= 0.0;
60 /// The number of documents we've rejected.
61 Xapian::doccount collapse_count
= 0;
64 /// Construct with the given item.
65 CollapseData(Xapian::doccount item
, Xapian::docid did
)
66 : items(1, { item
, did
})
69 /** Check a new result with this collapse key value.
71 * If this method determines the action to take is ADD or REPLACE, then
72 * the proto-mset should be updated and then add_item() called to complete
73 * the update of the CollapseData (if the result doesn't actually get
74 * added, then it's OK not to follow up with a call to add_item()).
76 * @param results The results so far.
77 * @param result The new result.
78 * @param collapse_max Max no. of items for each collapse key value.
79 * @param mcmp Result comparison functor.
80 * @param[out] old_item Item to be replaced (when REPLACE is returned).
82 * @return How to handle @a result: ADD, REJECT or REPLACE.
84 collapse_result
check_item(const std::vector
<Result
>& results
,
86 Xapian::doccount collapse_max
,
88 Xapian::doccount
& old_item
);
90 /** Set item after constructing with a placeholder.
92 * @param item The new item (index into results).
94 void set_item(Xapian::doccount item
);
96 /** Complete update of new result with this collapse key value.
98 * @param results The results so far.
99 * @param item The new item (index into results).
100 * @param collapse_max Max no. of items for each collapse key value.
101 * @param mcmp Result comparison functor.
103 void add_item(const std::vector
<Result
>& results
,
104 Xapian::doccount item
,
105 Xapian::doccount collapse_max
,
108 /** Process relocation of entry in results.
110 * @param from The old item (index into results).
111 * @param to The new item (index into results).
113 void result_has_moved(Xapian::doccount from
, Xapian::doccount to
) {
114 // Yes, this *is* a linear search, but only through up to collapse_max
115 // items and we expect collapse_max to be very small (it's the maximum
116 // number of items to keep for each collapse key value).
117 for (auto&& item
: items
) {
118 if (item
.first
== from
) {
123 // The entry ought to be present.
127 /// The highest weight of a document we've rejected.
128 double get_next_best_weight() const { return next_best_weight
; }
130 /// The number of documents we've rejected.
131 Xapian::doccount
get_collapse_count() const { return collapse_count
; }
134 /// The Collapser class tracks collapse keys and the documents they match.
136 /// Map from collapse key values to the items we're keeping for them.
137 std::unordered_map
<std::string
, CollapseData
> table
;
139 /// How many items we're currently keeping in @a table.
140 Xapian::doccount entry_count
= 0;
142 /** How many documents have we seen without a collapse key?
144 * We use this statistic to improve matches_lower_bound.
146 Xapian::doccount no_collapse_key
= 0;
148 /** How many documents with duplicate collapse keys we have ignored.
150 * We use this statistic to improve matches_estimated (by considering
151 * the rate of collapsing) and matches_upper_bound.
153 Xapian::doccount dups_ignored
= 0;
155 /** How many documents we've considered for collapsing.
157 * We use this statistic to improve matches_estimated (by considering
158 * the rate of collapsing).
160 Xapian::doccount docs_considered
= 0;
162 /** The value slot we're getting collapse keys from. */
163 Xapian::valueno slot
;
165 /** The maximum number of items to keep for each collapse key value. */
166 Xapian::doccount collapse_max
;
168 std::vector
<Result
>& results
;
172 /** Pointer to CollapseData when NEW or ADD is in progress. */
173 CollapseData
* ptr
= NULL
;
175 /// Adapt @a mcmp to be usable with min_heap.
176 bool operator()(Xapian::doccount a
, Xapian::doccount b
) const {
177 return mcmp(results
[a
], results
[b
]);
181 /// Replaced item when REPLACE is returned by @a collapse().
182 Xapian::doccount old_item
= 0;
184 Collapser(Xapian::valueno slot_
,
185 Xapian::doccount collapse_max_
,
186 std::vector
<Result
>& results_
,
189 collapse_max(collapse_max_
),
193 /// Return true if collapsing is active for this match.
194 operator bool() const { return collapse_max
!= 0; }
196 /** Check a new result.
198 * If this method determines the action to take is NEW or ADD then the
199 * proto-mset should be updated and then process() called to complete the
200 * update (if the result doesn't actually get added, then it's OK not
201 * to follow up with a call to process()).
203 * @param result The new result.
204 * @param vsdoc Document for getting values.
206 * @return How to handle @a result: EMPTY, NEW, ADD, REJECT or REPLACE.
208 collapse_result
check(Result
& result
,
209 Xapian::Document::Internal
& vsdoc
);
211 /** Handle a new Result.
213 * @param action The collapse_result returned by check().
214 * @param item The new item (index into results).
216 void process(collapse_result action
, Xapian::doccount item
);
218 /** Process relocation of entry in results.
220 * @param from The old item (index into results).
221 * @param to The new item (index into results).
223 void result_has_moved(Xapian::doccount from
, Xapian::doccount to
) {
224 const std::string
& collapse_key
= results
[to
].get_collapse_key();
225 if (collapse_key
.empty()) {
228 auto it
= table
.find(collapse_key
);
229 if (rare(it
== table
.end())) {
230 // The entry ought to be present.
235 CollapseData
& collapse_data
= it
->second
;
236 collapse_data
.result_has_moved(from
, to
);
239 Xapian::doccount
get_collapse_count(const std::string
& collapse_key
,
240 int percent_threshold
,
241 double min_weight
) const;
243 Xapian::doccount
get_docs_considered() const { return docs_considered
; }
245 Xapian::doccount
get_dups_ignored() const { return dups_ignored
; }
247 Xapian::doccount
get_entries() const { return entry_count
; }
249 Xapian::doccount
get_matches_lower_bound() const;
251 void finalise(double min_weight
, int percent_threshold
);
254 /** Simpler version of Collapser used when merging MSet objects.
256 * We merge results in descending rank order, so collapsing is much simpler
257 * than during the match - we just need to discard documents if we've already
258 * seen collapse_max with the same key.
260 class CollapserLite
{
261 /// Map from collapse key values to collapse counts.
262 std::unordered_map
<std::string
, Xapian::doccount
> table
;
264 /// How many items we're currently keeping in @a table.
265 Xapian::doccount entry_count
= 0;
267 /** How many documents have we seen without a collapse key?
269 * We use this statistic to improve matches_lower_bound.
271 Xapian::doccount no_collapse_key
= 0;
273 /** How many documents with duplicate collapse keys we have ignored.
275 * We use this statistic to improve matches_estimated (by considering
276 * the rate of collapsing) and matches_upper_bound.
278 Xapian::doccount dups_ignored
= 0;
280 /** How many documents we've considered for collapsing.
282 * We use this statistic to improve matches_estimated (by considering
283 * the rate of collapsing).
285 Xapian::doccount docs_considered
= 0;
287 /** The maximum number of items to keep for each collapse key value. */
288 Xapian::doccount collapse_max
;
291 CollapserLite(Xapian::doccount collapse_max_
)
292 : collapse_max(collapse_max_
) {}
294 /// Return true if collapsing is active for this match.
295 operator bool() const { return collapse_max
!= 0; }
297 /** Try to add a new key.
299 * @return true if accepted; false if rejected.
301 bool add(const std::string
& key
) {
309 auto r
= table
.emplace(key
, 1);
311 // New entry, set to 1.
312 } else if (r
.first
->second
== collapse_max
) {
313 // Already seen collapse_max with this key so reject.
324 Xapian::doccount
get_docs_considered() const { return docs_considered
; }
326 Xapian::doccount
get_dups_ignored() const { return dups_ignored
; }
328 Xapian::doccount
get_entries() const { return entry_count
; }
330 Xapian::doccount
get_matches_lower_bound() const {
331 return no_collapse_key
+ entry_count
;
334 void finalise(std::vector
<Result
>& results
, int percent_threshold
) {
335 if (table
.empty() || results
.empty())
338 // We need to fill in collapse_count values in results using the
339 // information stored in table.
340 Xapian::doccount todo
= entry_count
;
341 for (Result
& result
: results
) {
342 const std::string
& key
= result
.get_collapse_key();
346 // Adjust collapse_count.
347 if (percent_threshold
) {
348 // FIXME: We can probably do better here.
349 result
.set_collapse_count(1);
351 auto c
= result
.get_collapse_count() + table
[key
];
352 result
.set_collapse_count(c
);
356 // Terminate early if we've handled all non-empty entries.
363 #endif // XAPIAN_INCLUDED_COLLAPSER_H