2 * @brief MatchSpy implementation.
4 /* Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014,2015 Olly Betts
5 * Copyright (C) 2007,2009 Lemur Consulting Ltd
6 * Copyright (C) 2010 Richard Boulton
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include <xapian/matchspy.h>
26 #include <xapian/document.h>
27 #include <xapian/error.h>
28 #include <xapian/queryparser.h>
29 #include <xapian/registry.h>
39 #include "net/length.h"
40 #include "stringutils.h"
48 using namespace Xapian
;
49 using Xapian::Internal::intrusive_ptr
;
51 MatchSpy::~MatchSpy() {}
54 MatchSpy::clone() const {
55 throw UnimplementedError("MatchSpy not suitable for use with remote searches - clone() method unimplemented");
59 MatchSpy::name() const {
60 throw UnimplementedError("MatchSpy not suitable for use with remote searches - name() method unimplemented");
64 MatchSpy::serialise() const {
65 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise() method unimplemented");
69 MatchSpy::unserialise(const string
&, const Registry
&) const {
70 throw UnimplementedError("MatchSpy not suitable for use with remote searches - unserialise() method unimplemented");
74 MatchSpy::serialise_results() const {
75 throw UnimplementedError("MatchSpy not suitable for use with remote searches - serialise_results() method unimplemented");
79 MatchSpy::merge_results(const string
&) {
80 throw UnimplementedError("MatchSpy not suitable for use with remote searches - merge_results() method unimplemented");
84 MatchSpy::get_description() const {
85 return "Xapian::MatchSpy()";
88 XAPIAN_NORETURN(static void unsupported_method());
89 static void unsupported_method() {
90 throw Xapian::InvalidOperationError("Method not supported for this type of termlist");
93 /// A termlist iterator over the contents of a ValueCountMatchSpy
94 class ValueCountTermList final
: public TermList
{
96 map
<string
, Xapian::doccount
>::const_iterator it
;
98 intrusive_ptr
<Xapian::ValueCountMatchSpy::Internal
> spy
;
101 explicit ValueCountTermList(ValueCountMatchSpy::Internal
* spy_
)
104 it
= spy
->values
.begin();
108 string
get_termname() const {
114 Xapian::doccount
get_termfreq() const {
130 TermList
* skip_to(const string
& term
) {
131 while (it
!= spy
->values
.end() && it
->first
< term
) {
138 bool at_end() const {
140 return it
== spy
->values
.end();
143 Xapian::termcount
get_approx_size() const { unsupported_method(); return 0; }
144 Xapian::termcount
get_wdf() const { unsupported_method(); return 0; }
145 Xapian::PositionIterator
positionlist_begin() const {
146 unsupported_method();
147 return Xapian::PositionIterator();
149 Xapian::termcount
positionlist_count() const { unsupported_method(); return 0; }
152 /** A string with a corresponding frequency.
154 class StringAndFrequency
{
156 Xapian::doccount frequency
;
158 /// Construct a StringAndFrequency object.
159 StringAndFrequency(const std::string
& str_
, Xapian::doccount frequency_
)
160 : str(str_
), frequency(frequency_
) {}
162 /// Return the string.
163 std::string
get_string() const { return str
; }
165 /// Return the frequency.
166 Xapian::doccount
get_frequency() const { return frequency
; }
169 /** Compare two StringAndFrequency objects.
171 * The comparison is firstly by frequency (higher is better), then by string
172 * (earlier lexicographic sort is better).
174 class StringAndFreqCmpByFreq
{
176 /// Default constructor
177 StringAndFreqCmpByFreq() {}
179 /// Return true if a has a higher frequency than b.
180 /// If equal, compare by the str, to provide a stable sort order.
181 bool operator()(const StringAndFrequency
&a
,
182 const StringAndFrequency
&b
) const {
183 if (a
.get_frequency() > b
.get_frequency()) return true;
184 if (a
.get_frequency() < b
.get_frequency()) return false;
185 return a
.get_string() < b
.get_string();
189 /// A termlist iterator over a vector of StringAndFrequency objects.
190 class StringAndFreqTermList final
: public TermList
{
192 vector
<StringAndFrequency
>::const_iterator it
;
195 vector
<StringAndFrequency
> values
;
197 /** init should be called after the values have been set, but before
205 string
get_termname() const {
208 return it
->get_string();
211 Xapian::doccount
get_termfreq() const {
214 return it
->get_frequency();
227 TermList
* skip_to(const string
& term
) {
228 while (it
!= values
.end() && it
->get_string() < term
) {
235 bool at_end() const {
237 return it
== values
.end();
240 Xapian::termcount
get_approx_size() const { unsupported_method(); return 0; }
241 Xapian::termcount
get_wdf() const { unsupported_method(); return 0; }
242 Xapian::PositionIterator
positionlist_begin() const {
243 unsupported_method();
244 return Xapian::PositionIterator();
246 Xapian::termcount
positionlist_count() const { unsupported_method(); return 0; }
249 /** Get the most frequent items from a map from string to frequency.
251 * This takes input such as that in ValueCountMatchSpy::Internal::values and
252 * returns a vector of the most frequent items in the input.
254 * @param result A vector which will be filled with the most frequent
255 * items, in descending order of frequency. Items with
256 * the same frequency will be sorted in ascending
257 * alphabetical order.
259 * @param items The map from string to frequency, from which the most
260 * frequent items will be selected.
262 * @param maxitems The maximum number of items to return.
265 get_most_frequent_items(vector
<StringAndFrequency
> & result
,
266 const map
<string
, doccount
> & items
,
270 result
.reserve(maxitems
);
271 StringAndFreqCmpByFreq cmpfn
;
274 for (map
<string
, doccount
>::const_iterator i
= items
.begin();
275 i
!= items
.end(); ++i
) {
276 Assert(result
.size() <= maxitems
);
277 result
.push_back(StringAndFrequency(i
->first
, i
->second
));
278 if (result
.size() > maxitems
) {
279 // Make the list back into a heap.
281 // Only the new element isn't in the right place.
282 push_heap(result
.begin(), result
.end(), cmpfn
);
284 // Need to build heap from scratch.
285 make_heap(result
.begin(), result
.end(), cmpfn
);
288 pop_heap(result
.begin(), result
.end(), cmpfn
);
294 sort_heap(result
.begin(), result
.end(), cmpfn
);
296 sort(result
.begin(), result
.end(), cmpfn
);
301 ValueCountMatchSpy::operator()(const Document
&doc
, double) {
302 Assert(internal
.get());
304 string
val(doc
.get_value(internal
->slot
));
305 if (!val
.empty()) ++(internal
->values
[val
]);
309 ValueCountMatchSpy::values_begin() const
311 Assert(internal
.get());
312 return Xapian::TermIterator(new ValueCountTermList(internal
.get()));
316 ValueCountMatchSpy::top_values_begin(size_t maxvalues
) const
318 Assert(internal
.get());
319 AutoPtr
<StringAndFreqTermList
> termlist(new StringAndFreqTermList
);
320 get_most_frequent_items(termlist
->values
, internal
->values
, maxvalues
);
322 return Xapian::TermIterator(termlist
.release());
326 ValueCountMatchSpy::clone() const {
327 Assert(internal
.get());
328 return new ValueCountMatchSpy(internal
->slot
);
332 ValueCountMatchSpy::name() const {
333 return "Xapian::ValueCountMatchSpy";
337 ValueCountMatchSpy::serialise() const {
338 Assert(internal
.get());
340 result
+= encode_length(internal
->slot
);
345 ValueCountMatchSpy::unserialise(const string
& s
, const Registry
&) const
347 const char * p
= s
.data();
348 const char * end
= p
+ s
.size();
351 decode_length(&p
, end
, new_slot
);
353 throw NetworkError("Junk at end of serialised ValueCountMatchSpy");
356 return new ValueCountMatchSpy(new_slot
);
360 ValueCountMatchSpy::serialise_results() const {
361 LOGCALL(REMOTE
, string
, "ValueCountMatchSpy::serialise_results", NO_ARGS
);
362 Assert(internal
.get());
364 result
+= encode_length(internal
->total
);
365 result
+= encode_length(internal
->values
.size());
366 for (map
<string
, doccount
>::const_iterator i
= internal
->values
.begin();
367 i
!= internal
->values
.end(); ++i
) {
368 result
+= encode_length(i
->first
.size());
370 result
+= encode_length(i
->second
);
376 ValueCountMatchSpy::merge_results(const string
& s
) {
377 LOGCALL_VOID(REMOTE
, "ValueCountMatchSpy::merge_results", s
);
378 Assert(internal
.get());
379 const char * p
= s
.data();
380 const char * end
= p
+ s
.size();
383 decode_length(&p
, end
, n
);
384 internal
->total
+= n
;
386 map
<string
, doccount
>::size_type items
;
387 decode_length(&p
, end
, items
);
390 decode_length_and_check(&p
, end
, vallen
);
391 string
val(p
, vallen
);
394 decode_length(&p
, end
, freq
);
395 internal
->values
[val
] += freq
;
399 throw NetworkError("Junk at end of serialised ValueCountMatchSpy "
405 ValueCountMatchSpy::get_description() const {
406 string d
= "ValueCountMatchSpy(";
407 if (internal
.get()) {
408 d
+= str(internal
->total
);
409 d
+= " docs seen, looking in ";
410 d
+= str(internal
->values
.size());