scriptindex: Fix weird error cases
[xapian.git] / xapian-core / matcher / collapser.h
blob6be355bff81c585f517329941227996f4c8fe9c8
/** @file
 * @brief Collapse documents with the same collapse key during the match.
 */
/* Copyright (C) 2009,2011 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
21 #ifndef XAPIAN_INCLUDED_COLLAPSER_H
22 #define XAPIAN_INCLUDED_COLLAPSER_H
#include "backends/document.h"
#include "msetcmp.h"
#include "api/omenquireinternal.h"
#include "api/postlist.h"

#include <map>
#include <string>
#include <vector>
/// Enumeration reporting how a document was handled by the Collapser.
typedef enum {
    /// Only returned by Collapser::process(), never by
    /// CollapseData::add_item().  NOTE(review): presumably means the item
    /// had no collapse key - confirm against the implementation.
    EMPTY,
    /// The item was kept in addition to those already held.
    ADDED,
    /// The item was not kept.
    REJECTED,
    /// The item was kept in place of a previously held item, which is
    /// handed back to the caller.
    REPLACED
} collapse_result;
40 /// Class tracking information for a given value of the collapse key.
41 class CollapseData {
42 /** Currently kept MSet entries for this value of the collapse key.
44 * If collapse_max > 1, then this is a min-heap once collapse_count > 0.
46 * FIXME: We expect collapse_max to be small, so perhaps we should
47 * preallocate space for that many entries and/or allocate space in
48 * larger blocks to divvy up?
50 std::vector<Xapian::Internal::MSetItem> items;
52 /// The highest weight of a document we've rejected.
53 double next_best_weight;
55 /// The number of documents we've rejected.
56 Xapian::doccount collapse_count;
58 public:
59 /// Construct with the given MSetItem @a item.
60 explicit CollapseData(const Xapian::Internal::MSetItem & item)
61 : items(1, item), next_best_weight(0), collapse_count(0) {
62 items[0].collapse_key = std::string();
65 /** Handle a new MSetItem with this collapse key value.
67 * @param item The new item.
68 * @param collapse_max Max no. of items for each collapse key value.
69 * @param mcmp MSetItem comparison functor.
70 * @param[out] old_item Replaced item (when REPLACED is returned).
72 * @return How @a item was handled: ADDED, REJECTED or REPLACED.
74 collapse_result add_item(const Xapian::Internal::MSetItem & item,
75 Xapian::doccount collapse_max,
76 MSetCmp mcmp,
77 Xapian::Internal::MSetItem & old_item);
79 /// The highest weight of a document we've rejected.
80 double get_next_best_weight() const { return next_best_weight; }
82 /// The number of documents we've rejected.
83 Xapian::doccount get_collapse_count() const { return collapse_count; }
86 /// The Collapser class tracks collapse keys and the documents they match.
87 class Collapser {
88 /// Map from collapse key values to the items we're keeping for them.
89 std::map<std::string, CollapseData> table;
91 /// How many items we're currently keeping in @a table.
92 Xapian::doccount entry_count;
94 /** How many documents have we seen without a collapse key?
96 * We use this statistic to improve matches_lower_bound.
98 Xapian::doccount no_collapse_key;
100 /** How many documents with duplicate collapse keys we have ignored.
102 * We use this statistic to improve matches_estimated (by considering
103 * the rate of collapsing) and matches_upper_bound.
105 Xapian::doccount dups_ignored;
107 /** How many documents we've considered for collapsing.
109 * We use this statistic to improve matches_estimated (by considering
110 * the rate of collapsing).
112 Xapian::doccount docs_considered;
114 /** The value slot we're getting collapse keys from. */
115 Xapian::valueno slot;
117 /** The maximum number of items to keep for each collapse key value. */
118 Xapian::doccount collapse_max;
120 public:
121 /// Replaced item when REPLACED is returned by @a collapse().
122 Xapian::Internal::MSetItem old_item;
124 Collapser(Xapian::valueno slot_, Xapian::doccount collapse_max_)
125 : entry_count(0), no_collapse_key(0), dups_ignored(0),
126 docs_considered(0), slot(slot_), collapse_max(collapse_max_),
127 old_item(0, 0) { }
129 /// Return true if collapsing is active for this match.
130 operator bool() const { return collapse_max != 0; }
132 /** Handle a new MSetItem.
134 * @param item The new item.
135 * @param postlist PostList to try to get collapse key from
136 * (this happens for a remote match).
137 * @param doc Document for getting values.
138 * @param mcmp MSetItem comparison functor.
140 * @return How @a item was handled: EMPTY, ADDED, REJECTED or REPLACED.
142 collapse_result process(Xapian::Internal::MSetItem & item,
143 PostList * postlist,
144 Xapian::Document::Internal & vsdoc,
145 MSetCmp mcmp);
147 Xapian::doccount get_collapse_count(const std::string & collapse_key,
148 int percent_cutoff,
149 double min_weight) const;
151 Xapian::doccount get_docs_considered() const { return docs_considered; }
153 Xapian::doccount get_dups_ignored() const { return dups_ignored; }
155 Xapian::doccount entries() const { return entry_count; }
157 Xapian::doccount get_matches_lower_bound() const;
159 bool empty() const { return table.empty(); }
162 #endif // XAPIAN_INCLUDED_COLLAPSER_H