Handle <title> in SVG
[xapian.git] / xapian-core / api / matchspy.cc
bloba74fa3af2e0006b4986fc9d6f9570a3bd5c066d5
1 /** @file
2 * @brief MatchSpy implementation.
3 */
4 /* Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014,2015 Olly Betts
5 * Copyright (C) 2007,2009 Lemur Consulting Ltd
6 * Copyright (C) 2010 Richard Boulton
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
24 #include <xapian/matchspy.h>
26 #include <xapian/document.h>
27 #include <xapian/error.h>
28 #include <xapian/queryparser.h>
29 #include <xapian/registry.h>
31 #include <map>
32 #include <string>
33 #include <vector>
35 #include "autoptr.h"
36 #include "debuglog.h"
37 #include "noreturn.h"
38 #include "omassert.h"
39 #include "net/length.h"
40 #include "stringutils.h"
41 #include "str.h"
42 #include "termlist.h"
44 #include <cfloat>
45 #include <cmath>
47 using namespace std;
48 using namespace Xapian;
49 using Xapian::Internal::intrusive_ptr;
51 MatchSpy::~MatchSpy() {}
53 MatchSpy *
54 MatchSpy::clone() const {
55 throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented");
58 string
59 MatchSpy::name() const {
60 throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented");
63 string
64 MatchSpy::serialise() const {
65 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented");
68 MatchSpy *
69 MatchSpy::unserialise(const string &, const Registry &) const {
70 throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented");
73 string
74 MatchSpy::serialise_results() const {
75 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented");
78 void
79 MatchSpy::merge_results(const string &) {
80 throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented");
83 string
84 MatchSpy::get_description() const {
85 return "Xapian::MatchSpy()";
88 XAPIAN_NORETURN(static void unsupported_method());
89 static void unsupported_method() {
90 throw Xapian::InvalidOperationError("Method not supported for this type of termlist");
93 /// A termlist iterator over the contents of a ValueCountMatchSpy
94 class ValueCountTermList final : public TermList {
95 private:
96 map<string, Xapian::doccount>::const_iterator it;
97 bool started;
98 intrusive_ptr<Xapian::ValueCountMatchSpy::Internal> spy;
99 public:
101 explicit ValueCountTermList(ValueCountMatchSpy::Internal * spy_)
102 : spy(spy_)
104 it = spy->values.begin();
105 started = false;
108 string get_termname() const {
109 Assert(started);
110 Assert(!at_end());
111 return it->first;
114 Xapian::doccount get_termfreq() const {
115 Assert(started);
116 Assert(!at_end());
117 return it->second;
120 TermList * next() {
121 if (!started) {
122 started = true;
123 } else {
124 Assert(!at_end());
125 ++it;
127 return NULL;
130 TermList * skip_to(const string & term) {
131 while (it != spy->values.end() && it->first < term) {
132 ++it;
134 started = true;
135 return NULL;
138 bool at_end() const {
139 Assert(started);
140 return it == spy->values.end();
143 Xapian::termcount get_approx_size() const { unsupported_method(); return 0; }
144 Xapian::termcount get_wdf() const { unsupported_method(); return 0; }
145 Xapian::PositionIterator positionlist_begin() const {
146 unsupported_method();
147 return Xapian::PositionIterator();
149 Xapian::termcount positionlist_count() const { unsupported_method(); return 0; }
152 /** A string with a corresponding frequency.
154 class StringAndFrequency {
155 std::string str;
156 Xapian::doccount frequency;
157 public:
158 /// Construct a StringAndFrequency object.
159 StringAndFrequency(const std::string & str_, Xapian::doccount frequency_)
160 : str(str_), frequency(frequency_) {}
162 /// Return the string.
163 std::string get_string() const { return str; }
165 /// Return the frequency.
166 Xapian::doccount get_frequency() const { return frequency; }
169 /** Compare two StringAndFrequency objects.
171 * The comparison is firstly by frequency (higher is better), then by string
172 * (earlier lexicographic sort is better).
174 class StringAndFreqCmpByFreq {
175 public:
176 /// Default constructor
177 StringAndFreqCmpByFreq() {}
179 /// Return true if a has a higher frequency than b.
180 /// If equal, compare by the str, to provide a stable sort order.
181 bool operator()(const StringAndFrequency &a,
182 const StringAndFrequency &b) const {
183 if (a.get_frequency() > b.get_frequency()) return true;
184 if (a.get_frequency() < b.get_frequency()) return false;
185 return a.get_string() < b.get_string();
189 /// A termlist iterator over a vector of StringAndFrequency objects.
190 class StringAndFreqTermList final : public TermList {
191 private:
192 vector<StringAndFrequency>::const_iterator it;
193 bool started;
194 public:
195 vector<StringAndFrequency> values;
197 /** init should be called after the values have been set, but before
198 * iteration begins.
200 void init() {
201 it = values.begin();
202 started = false;
205 string get_termname() const {
206 Assert(started);
207 Assert(!at_end());
208 return it->get_string();
211 Xapian::doccount get_termfreq() const {
212 Assert(started);
213 Assert(!at_end());
214 return it->get_frequency();
217 TermList * next() {
218 if (!started) {
219 started = true;
220 } else {
221 Assert(!at_end());
222 ++it;
224 return NULL;
227 TermList * skip_to(const string & term) {
228 while (it != values.end() && it->get_string() < term) {
229 ++it;
231 started = true;
232 return NULL;
235 bool at_end() const {
236 Assert(started);
237 return it == values.end();
240 Xapian::termcount get_approx_size() const { unsupported_method(); return 0; }
241 Xapian::termcount get_wdf() const { unsupported_method(); return 0; }
242 Xapian::PositionIterator positionlist_begin() const {
243 unsupported_method();
244 return Xapian::PositionIterator();
246 Xapian::termcount positionlist_count() const { unsupported_method(); return 0; }
249 /** Get the most frequent items from a map from string to frequency.
251 * This takes input such as that in ValueCountMatchSpy::Internal::values and
252 * returns a vector of the most frequent items in the input.
254 * @param result A vector which will be filled with the most frequent
255 * items, in descending order of frequency. Items with
256 * the same frequency will be sorted in ascending
257 * alphabetical order.
259 * @param items The map from string to frequency, from which the most
260 * frequent items will be selected.
262 * @param maxitems The maximum number of items to return.
264 static void
265 get_most_frequent_items(vector<StringAndFrequency> & result,
266 const map<string, doccount> & items,
267 size_t maxitems)
269 result.clear();
270 result.reserve(maxitems);
271 StringAndFreqCmpByFreq cmpfn;
272 bool is_heap(false);
274 for (map<string, doccount>::const_iterator i = items.begin();
275 i != items.end(); ++i) {
276 Assert(result.size() <= maxitems);
277 result.push_back(StringAndFrequency(i->first, i->second));
278 if (result.size() > maxitems) {
279 // Make the list back into a heap.
280 if (is_heap) {
281 // Only the new element isn't in the right place.
282 push_heap(result.begin(), result.end(), cmpfn);
283 } else {
284 // Need to build heap from scratch.
285 make_heap(result.begin(), result.end(), cmpfn);
286 is_heap = true;
288 pop_heap(result.begin(), result.end(), cmpfn);
289 result.pop_back();
293 if (is_heap) {
294 sort_heap(result.begin(), result.end(), cmpfn);
295 } else {
296 sort(result.begin(), result.end(), cmpfn);
300 void
301 ValueCountMatchSpy::operator()(const Document &doc, double) {
302 Assert(internal.get());
303 ++(internal->total);
304 string val(doc.get_value(internal->slot));
305 if (!val.empty()) ++(internal->values[val]);
308 TermIterator
309 ValueCountMatchSpy::values_begin() const
311 Assert(internal.get());
312 return Xapian::TermIterator(new ValueCountTermList(internal.get()));
315 TermIterator
316 ValueCountMatchSpy::top_values_begin(size_t maxvalues) const
318 Assert(internal.get());
319 AutoPtr<StringAndFreqTermList> termlist(new StringAndFreqTermList);
320 get_most_frequent_items(termlist->values, internal->values, maxvalues);
321 termlist->init();
322 return Xapian::TermIterator(termlist.release());
325 MatchSpy *
326 ValueCountMatchSpy::clone() const {
327 Assert(internal.get());
328 return new ValueCountMatchSpy(internal->slot);
331 string
332 ValueCountMatchSpy::name() const {
333 return "Xapian::ValueCountMatchSpy";
336 string
337 ValueCountMatchSpy::serialise() const {
338 Assert(internal.get());
339 string result;
340 result += encode_length(internal->slot);
341 return result;
344 MatchSpy *
345 ValueCountMatchSpy::unserialise(const string & s, const Registry &) const
347 const char * p = s.data();
348 const char * end = p + s.size();
350 valueno new_slot;
351 decode_length(&p, end, new_slot);
352 if (p != end) {
353 throw NetworkError("Junk at end of serialised ValueCountMatchSpy");
356 return new ValueCountMatchSpy(new_slot);
359 string
360 ValueCountMatchSpy::serialise_results() const {
361 LOGCALL(REMOTE, string, "ValueCountMatchSpy::serialise_results", NO_ARGS);
362 Assert(internal.get());
363 string result;
364 result += encode_length(internal->total);
365 result += encode_length(internal->values.size());
366 for (map<string, doccount>::const_iterator i = internal->values.begin();
367 i != internal->values.end(); ++i) {
368 result += encode_length(i->first.size());
369 result += i->first;
370 result += encode_length(i->second);
372 RETURN(result);
375 void
376 ValueCountMatchSpy::merge_results(const string & s) {
377 LOGCALL_VOID(REMOTE, "ValueCountMatchSpy::merge_results", s);
378 Assert(internal.get());
379 const char * p = s.data();
380 const char * end = p + s.size();
382 Xapian::doccount n;
383 decode_length(&p, end, n);
384 internal->total += n;
386 map<string, doccount>::size_type items;
387 decode_length(&p, end, items);
388 while (items != 0) {
389 size_t vallen;
390 decode_length_and_check(&p, end, vallen);
391 string val(p, vallen);
392 p += vallen;
393 doccount freq;
394 decode_length(&p, end, freq);
395 internal->values[val] += freq;
396 --items;
398 if (p != end) {
399 throw NetworkError("Junk at end of serialised ValueCountMatchSpy "
400 "results");
404 string
405 ValueCountMatchSpy::get_description() const {
406 string d = "ValueCountMatchSpy(";
407 if (internal.get()) {
408 d += str(internal->total);
409 d += " docs seen, looking in ";
410 d += str(internal->values.size());
411 d += " slots)";
412 } else {
413 d += ")";
415 return d;