2 * @brief API for running queries
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2011,2012,2013,2014,2015,2016 Olly Betts
7 * Copyright 2009 Lemur Consulting Ltd
8 * Copyright 2011 Action Without Borders
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 #ifndef XAPIAN_INCLUDED_ENQUIRE_H
27 #define XAPIAN_INCLUDED_ENQUIRE_H
29 #if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
30 # error Never use <xapian/enquire.h> directly; include <xapian.h> instead.
33 #include "xapian/deprecated.h"
36 #include <xapian/attributes.h>
37 #include <xapian/eset.h>
38 #include <xapian/intrusive_ptr.h>
39 #include <xapian/mset.h>
40 #include <xapian/stem.h>
41 #include <xapian/types.h>
42 #include <xapian/termiterator.h>
43 #include <xapian/visibility.h>
56 /** A relevance set (R-Set).
57 * This is the set of documents which are marked as relevant, for use
58 * in modifying the term weights, and in performing query expansion.
60 class XAPIAN_VISIBILITY_DEFAULT RSet
{
62 /// Class holding details of RSet
65 /// @private @internal Reference counted internals.
66 Xapian::Internal::intrusive_ptr
<Internal
> internal
;
69 RSet(const RSet
&rset
);
71 /// Assignment operator
72 void operator=(const RSet
&rset
);
74 #ifdef XAPIAN_MOVE_SEMANTICS
78 /// Move assignment operator.
79 RSet
& operator=(RSet
&& o
);
82 /// Default constructor
88 /** The number of documents in this R-Set */
89 Xapian::doccount
size() const;
91 /** Test if this R-Set is empty */
94 /// Add a document to the relevance set.
95 void add_document(Xapian::docid did
);
97 /// Add the document which MSetIterator @a i refers to, by passing *i to add_document(Xapian::docid).
98 void add_document(const Xapian::MSetIterator
& i
) { add_document(*i
); }
100 /// Remove a document from the relevance set.
101 void remove_document(Xapian::docid did
);
103 /// Remove the document which MSetIterator @a i refers to, by passing *i to remove_document(Xapian::docid).
104 void remove_document(const Xapian::MSetIterator
& i
) { remove_document(*i
); }
106 /// Test if a given document is in the relevance set.
107 bool contains(Xapian::docid did
) const;
109 /// Test if the document which MSetIterator @a i refers to is in the relevance set (passes *i to contains(Xapian::docid)).
110 bool contains(const Xapian::MSetIterator
& i
) const { return contains(*i
); }
112 /// Return a string describing this object.
113 std::string
get_description() const;
116 /** Base class for matcher decision functor.
118 class XAPIAN_VISIBILITY_DEFAULT MatchDecider
{
119 /// Don't allow assignment.
120 void operator=(const MatchDecider
&);
122 /// Don't allow copying.
123 MatchDecider(const MatchDecider
&);
126 /// Default constructor
129 /** Decide whether we want this document to be in the MSet.
131 * @param doc The document to test.
133 * @return true if the document is acceptable, or false if the document
134 * should be excluded from the MSet.
136 virtual bool operator()(const Xapian::Document
&doc
) const = 0;
139 virtual ~MatchDecider();
142 /** This class provides an interface to the information retrieval
143 * system for the purpose of searching.
145 * Databases are usually opened lazily, so exceptions may not be
146 * thrown where you would expect them to be. You should catch
147 * Xapian::Error exceptions when calling any method in Xapian::Enquire.
149 * @exception Xapian::InvalidArgumentError will be thrown if an invalid
150 * argument is supplied, for example, an unknown database type.
152 class XAPIAN_VISIBILITY_DEFAULT Enquire
{
154 /// Copying is allowed (and is cheap).
155 Enquire(const Enquire
& other
);
157 /// Assignment is allowed (and is cheap).
158 void operator=(const Enquire
& other
);
160 #ifdef XAPIAN_MOVE_SEMANTICS
161 /// Move constructor.
162 Enquire(Enquire
&& o
);
164 /// Move assignment operator.
165 Enquire
& operator=(Enquire
&& o
);
169 /// @private @internal Reference counted internals.
170 Xapian::Internal::intrusive_ptr
<Internal
> internal
;
172 /** Create a Xapian::Enquire object.
174 * This specification cannot be changed once the Xapian::Enquire is
175 * opened: you must create a new Xapian::Enquire object to access a
176 * different database, or set of databases.
178 * The database supplied must have been initialised (ie, must not be
179 * the result of calling the Database::Database() constructor). If
180 * you need to handle a situation where you have no databases
181 * gracefully, a database created with DB_BACKEND_INMEMORY can be
182 * passed here to provide a completely empty database.
184 * @param database Specification of the database or databases to use.
187 * @exception Xapian::InvalidArgumentError will be thrown if an
188 * empty Database object is supplied.
190 explicit Enquire(const Database
&database
);
192 /** Create a Xapian::Enquire object.
194 * This specification cannot be changed once the Xapian::Enquire is
195 * opened: you must create a new Xapian::Enquire object to access a
196 * different database, or set of databases.
198 * The database supplied must have been initialised (ie, must not be
199 * the result of calling the Database::Database() constructor). If
200 * you need to handle a situation where you have no databases
201 * gracefully, a database created with DB_BACKEND_INMEMORY can be
202 * passed here to provide a completely empty database.
204 * @param database Specification of the database or databases to use.
206 * @param errorhandler_ This parameter is deprecated (since Xapian
207 * 1.3.1), and as of 1.3.5 it's ignored completely.
209 * @exception Xapian::InvalidArgumentError will be thrown if an
210 * empty Database object is supplied.
212 XAPIAN_DEPRECATED_EX(Enquire(const Database
&database
, ErrorHandler
* errorhandler_
));
214 /** Close the Xapian::Enquire object.
218 /** Set the query to run.
220 * @param query the new query to run.
221 * @param qlen the query length to use in weight calculations -
222 * by default the sum of the wqf of all terms is used.
224 void set_query(const Xapian::Query
& query
, Xapian::termcount qlen
= 0);
226 /** Get the current query.
228 * If called before set_query(), this will return a default
229 * initialised Query object.
231 const Xapian::Query
& get_query() const;
235 * This matchspy will be called with some of the documents which match
236 * the query, during the match process. Exactly which of the matching
237 * documents are passed to it depends on exactly when certain
238 * optimisations occur during the match process, but it can be
239 * controlled to some extent by setting the @a checkatleast parameter
242 * In particular, if there are enough matching documents, at least the
243 * number specified by @a checkatleast will be passed to the matchspy.
244 * This means that you can force the matchspy to be shown all matching
245 * documents by setting @a checkatleast to the number of documents in
248 * @param spy The MatchSpy subclass to add. The caller must
249 * ensure that this remains valid while the Enquire
250 * object remains active, or until @a
251 * clear_matchspies() is called.
253 void add_matchspy(MatchSpy
* spy
);
255 /** Remove all the matchspies.
257 void clear_matchspies();
259 /** Set the weighting scheme to use for queries.
261 * @param weight_ the new weighting scheme. If no weighting scheme
262 * is specified, the default is BM25 with the
263 * default parameters.
265 void set_weighting_scheme(const Weight
&weight_
);
267 /** Set the weighting scheme to use for expansion.
269 * If you don't call this method, the default is as if you'd used:
271 * set_expansion_scheme("trad");
273 * @param eweightname_ A string in lowercase specifying the name of
274 * the scheme to be used. The following schemes
275 * are currently available:
276 * "bo1" : The Bo1 scheme for query expansion.
277 * "trad" : The TradWeight scheme for query expansion.
278 * @param expand_k_ The parameter required for TradWeight query expansion.
279 * A default value of 1.0 is used if none is specified.
281 void set_expansion_scheme(const std::string
&eweightname_
,
282 double expand_k_
= 1.0) const;
284 /** Set the collapse key to use for queries.
286 * @param collapse_key value number to collapse on - at most one MSet
287 * entry with each particular value will be returned
288 * (default is Xapian::BAD_VALUENO which means no collapsing).
290 * @param collapse_max Max number of items with the same key to leave
291 * after collapsing (default 1).
293 * The MSet returned by get_mset() will have only the "best"
294 * (at most) @a collapse_max entries with each particular
295 * value of @a collapse_key ("best" being highest ranked - i.e.
296 * highest weight or highest sorting key).
298 * An example use might be to create a value for each document
299 * containing an MD5 hash of the document contents. Then
300 * duplicate documents from different sources can be eliminated at
301 * search time by collapsing with @a collapse_max = 1 (it's better
302 * to eliminate duplicates at index time, but this may not
303 * always be possible - for example the search may be over more
304 * than one Xapian database).
306 * Another use is to group matches in a particular category (e.g.
307 * you might collapse a mailing list search on the Subject: so
308 * that there's only one result per discussion thread). In this
309 * case you can use get_collapse_count() to give the user some
310 * idea how many other results there are. And if you index the
311 * Subject: as a boolean term as well as putting it in a value,
312 * you can offer a link to a non-collapsed search restricted to
313 * that thread using a boolean filter.
315 void set_collapse_key(Xapian::valueno collapse_key
,
316 Xapian::doccount collapse_max
= 1);
318 /** Ordering of docids.
320 * Parameter to Enquire::set_docid_order().
323 /** docids sort in ascending order (default) */
325 /** docids sort in descending order. */
327 /** docids sort in whatever order is most efficient for the backend. */
332 /** Set sort order for document IDs.
334 * This order only has an effect on documents which would otherwise
335 * have equal rank. When ordering by relevance without a sort key,
336 * this means documents with equal weight. For a boolean match
337 * with no sort key, this means all documents. And if a sort key
338 * is used, this means documents with the same sort key (and also equal
339 * weight if ordering on relevance before or after the sort key).
341 * @param order This can be:
342 * - Xapian::Enquire::ASCENDING
343 * docids sort in ascending order (default)
344 * - Xapian::Enquire::DESCENDING
345 * docids sort in descending order
346 * - Xapian::Enquire::DONT_CARE
347 * docids sort in whatever order is most efficient for the backend
349 * Note: If you add documents in strict date order, then a boolean
350 * search - i.e. set_weighting_scheme(Xapian::BoolWeight()) - with
351 * set_docid_order(Xapian::Enquire::DESCENDING) is an efficient
352 * way to perform "sort by date, newest first", and with
353 * set_docid_order(Xapian::Enquire::ASCENDING) a very efficient way
354 * to perform "sort by date, oldest first".
356 void set_docid_order(docid_order order
);
358 /** Set the percentage and/or weight cutoffs.
360 * @param percent_cutoff Minimum percentage score for returned
361 * documents. If a document has a lower percentage score than this,
362 * it will not appear in the MSet. If your intention is to return
363 * only matches which contain all the terms in the query, then
364 * it's more efficient to use Xapian::Query::OP_AND instead of
365 * Xapian::Query::OP_OR in the query than to use set_cutoff(100).
366 * (default 0 => no percentage cut-off).
367 * @param weight_cutoff Minimum weight for a document to be returned.
368 * If a document has a lower score than this, it will not appear
369 * in the MSet. It is usually only possible to choose an
370 * appropriate weight for cutoff based on the results of a
371 * previous run of the same query; this is thus mainly useful for
372 * alerting operations. The other potential use is with a user
373 * specified weighting scheme.
374 * (default 0 => no weight cut-off).
376 void set_cutoff(int percent_cutoff
, double weight_cutoff
= 0);
378 /** Set the sorting to be by relevance only.
380 * This is the default.
382 void set_sort_by_relevance();
384 /** Set the sorting to be by value only.
386 * Note that sorting by values uses a string comparison, so to use
387 * this to sort by a numeric value you'll need to store the numeric
388 * values in a manner which sorts appropriately. For example, you
389 * could use Xapian::sortable_serialise() (which works for floating
390 * point numbers as well as integers), or store numbers padded with
391 * leading zeros or spaces, or with the number of digits prepended.
393 * @param sort_key value number to sort on.
395 * @param reverse If true, reverses the sort order.
397 void set_sort_by_value(Xapian::valueno sort_key
, bool reverse
);
399 /** Set the sorting to be by key generated from values only.
401 * @param sorter The functor to use for generating keys.
403 * @param reverse If true, reverses the sort order.
405 void set_sort_by_key(Xapian::KeyMaker
* sorter
, bool reverse
);
407 /** Set the sorting to be by value, then by relevance for documents
408 * with the same value.
410 * Note that sorting by values uses a string comparison, so to use
411 * this to sort by a numeric value you'll need to store the numeric
412 * values in a manner which sorts appropriately. For example, you
413 * could use Xapian::sortable_serialise() (which works for floating
414 * point numbers as well as integers), or store numbers padded with
415 * leading zeros or spaces, or with the number of digits prepended.
417 * @param sort_key value number to sort on.
419 * @param reverse If true, reverses the sort order.
421 void set_sort_by_value_then_relevance(Xapian::valueno sort_key
,
424 /** Set the sorting to be by keys generated from values, then by
425 * relevance for documents with identical keys.
427 * @param sorter The functor to use for generating keys.
429 * @param reverse If true, reverses the sort order.
431 void set_sort_by_key_then_relevance(Xapian::KeyMaker
* sorter
,
434 /** Set the sorting to be by relevance then value.
436 * Note that sorting by values uses a string comparison, so to use
437 * this to sort by a numeric value you'll need to store the numeric
438 * values in a manner which sorts appropriately. For example, you
439 * could use Xapian::sortable_serialise() (which works for floating
440 * point numbers as well as integers), or store numbers padded with
441 * leading zeros or spaces, or with the number of digits prepended.
443 * Note that with the default BM25 weighting scheme parameters,
444 * non-identical documents will rarely have the same weight, so
445 * this setting will give very similar results to
446 * set_sort_by_relevance(). It becomes more useful with particular
447 * BM25 parameter settings (e.g. BM25Weight(1,0,1,0,0)) or custom
450 * @param sort_key value number to sort on.
452 * @param reverse If true, reverses the sort order of sort_key.
453 * Beware that in 1.2.16 and earlier, the sense
454 * of this parameter was incorrectly inverted
455 * and inconsistent with the other set_sort_by_...
456 * methods. This was fixed in 1.2.17, so make that
457 * version a minimum requirement if this detail
458 * matters to your application.
460 void set_sort_by_relevance_then_value(Xapian::valueno sort_key
,
463 /** Set the sorting to be by relevance, then by keys generated from
466 * Note that with the default BM25 weighting scheme parameters,
467 * non-identical documents will rarely have the same weight, so
468 * this setting will give very similar results to
469 * set_sort_by_relevance(). It becomes more useful with particular
470 * BM25 parameter settings (e.g. BM25Weight(1,0,1,0,0)) or custom
473 * @param sorter The functor to use for generating keys.
475 * @param reverse If true, reverses the sort order of the generated
476 * keys. Beware that in 1.2.16 and earlier, the sense
477 * of this parameter was incorrectly inverted
478 * and inconsistent with the other set_sort_by_...
479 * methods. This was fixed in 1.2.17, so make that
480 * version a minimum requirement if this detail
481 * matters to your application.
483 void set_sort_by_relevance_then_key(Xapian::KeyMaker
* sorter
,
486 /** Set a time limit for the match.
488 * Matches with check_at_least set high can take a long time in some
489 * cases. You can set a time limit on this, after which check_at_least
490 * will be turned off.
492 * @param time_limit time in seconds after which to disable
493 * check_at_least (default: 0.0 which means no time limit).
498 * This feature is currently supported on platforms which support POSIX
499 * interval timers. Interaction with the remote backend when using
500 * multiple databases may have bugs. There's not currently a way to
501 * force the match to end after a certain time.
503 void set_time_limit(double time_limit
);
505 /** Get (a portion of) the match set for the current query.
507 * @param first the first item in the result set to return.
508 * A value of zero corresponds to the first item
509 * returned being that with the highest score.
510 * A value of 10 corresponds to the first 10 items
511 * being ignored, and the returned items starting at the eleventh.
513 * @param maxitems the maximum number of items to return. If you
514 * want all matches, then you can pass the result
515 * of calling get_doccount() on the Database object
516 * (though if you are doing this so you can filter
517 * results, you are likely to get much better
518 * performance by using Xapian's match-time filtering
519 * features instead). You can pass 0 for maxitems
520 * which will give you an empty MSet with valid
521 * statistics (such as get_matches_estimated())
522 * calculated without looking at any postings, which
523 * is very quick, but means the estimates may be
524 * more approximate and the bounds may be much
526 * @param checkatleast the minimum number of items to check. Because
527 * the matcher optimises, it won't consider every
528 * document which might match, so the total number
529 * of matches is estimated. Setting checkatleast
530 * forces it to consider at least this many matches
531 * and so allows for reliable paging links.
532 * @param omrset the relevance set to use when performing the query.
533 * @param mdecider a decision functor to use to decide whether a
534 * given document should be put in the MSet.
536 * @return A Xapian::MSet object containing the results of the
539 * @exception Xapian::InvalidArgumentError See class documentation.
541 MSet
get_mset(Xapian::doccount first
, Xapian::doccount maxitems
,
542 Xapian::doccount checkatleast
= 0,
543 const RSet
* omrset
= 0,
544 const MatchDecider
* mdecider
= 0) const;
546 /** Get (a portion of) the match set for the current query.
548 * @param first the first item in the result set to return.
549 * A value of zero corresponds to the first item
550 * returned being that with the highest score.
551 * A value of 10 corresponds to the first 10 items
552 * being ignored, and the returned items starting at the eleventh.
554 * @param maxitems the maximum number of items to return. If you
555 * want all matches, then you can pass the result
556 * of calling get_doccount() on the Database object
557 * (though if you are doing this so you can filter
558 * results, you are likely to get much better
559 * performance by using Xapian's match-time filtering
560 * features instead). You can pass 0 for maxitems
561 * which will give you an empty MSet with valid
562 * statistics (such as get_matches_estimated())
563 * calculated without looking at any postings, which
564 * is very quick, but means the estimates may be
565 * more approximate and the bounds may be much
567 * @param omrset the relevance set to use when performing the query.
568 * @param mdecider a decision functor to use to decide whether a
569 * given document should be put in the MSet.
571 * @return A Xapian::MSet object containing the results of the
574 * @exception Xapian::InvalidArgumentError See class documentation.
576 MSet
get_mset(Xapian::doccount first
, Xapian::doccount maxitems
,
578 const MatchDecider
* mdecider
= 0) const {
// Convenience overload: forward to the get_mset() overload which takes
// checkatleast, passing 0 for checkatleast.
579 return get_mset(first
, maxitems
, 0, omrset
, mdecider
);
582 /** Terms in the query may be returned by get_eset().
584 * The original intended use for Enquire::get_eset() is for query
585 * expansion - suggesting terms to add to the query, generally with
586 * the aim of improving recall (i.e. finding more of the relevant
587 * documents), so by default terms already in the query won't be
588 * returned in the ESet. For some uses you might want to consider
589 * all terms, and this flag allows you to specify that.
591 static const int INCLUDE_QUERY_TERMS
= 1;
593 /** Calculate exact term frequencies in get_eset().
595 * By default, when working over multiple databases,
596 * Enquire::get_eset() uses an approximation to the termfreq to
597 * improve efficiency. This should still return good results, but
598 * if you want to calculate the exact combined termfreq then you
601 static const int USE_EXACT_TERMFREQ
= 2;
603 /** Get the expand set for the given rset.
605 * @param maxitems the maximum number of items to return.
606 * @param omrset the relevance set to use when performing
607 * the expand operation.
608 * @param flags zero or more of these values |-ed together:
609 * - Xapian::Enquire::INCLUDE_QUERY_TERMS query
610 * terms may be returned from expand
611 * - Xapian::Enquire::USE_EXACT_TERMFREQ for multi
612 * dbs, calculate the exact termfreq; otherwise an
613 * approximation is used which can greatly improve
614 * efficiency, but still returns good results.
615 * @param edecider a decision functor to use to decide whether a
616 * given term should be put in the ESet
617 * @param min_wt the minimum weight for included terms
619 * @return An ESet object containing the results of the
622 * @exception Xapian::InvalidArgumentError See class documentation.
624 ESet
get_eset(Xapian::termcount maxitems
,
627 const Xapian::ExpandDecider
* edecider
= 0,
628 double min_wt
= 0.0) const;
630 /** Get the expand set for the given rset.
632 * @param maxitems the maximum number of items to return.
633 * @param omrset the relevance set to use when performing
634 * the expand operation.
635 * @param edecider a decision functor to use to decide whether a
636 * given term should be put in the ESet
638 * @return An ESet object containing the results of the
641 * @exception Xapian::InvalidArgumentError See class documentation.
643 ESet
get_eset(Xapian::termcount maxitems
, const RSet
& omrset
,
644 const Xapian::ExpandDecider
* edecider
) const {
// Convenience overload: forward to the get_eset() overload which takes
// flags, passing 0 (no flags set) for flags.
645 return get_eset(maxitems
, omrset
, 0, edecider
);
648 /** Get the expand set for the given rset.
650 * @param maxitems the maximum number of items to return.
651 * @param rset the relevance set to use when performing
652 * the expand operation.
653 * @param flags zero or more of these values |-ed together:
654 * - Xapian::Enquire::INCLUDE_QUERY_TERMS query
655 * terms may be returned from expand
656 * - Xapian::Enquire::USE_EXACT_TERMFREQ for multi
657 * dbs, calculate the exact termfreq; otherwise an
658 * approximation is used which can greatly improve
659 * efficiency, but still returns good results.
660 * @param k the parameter k in the query expansion algorithm
662 * @param edecider a decision functor to use to decide whether a
663 * given term should be put in the ESet
665 * @param min_wt the minimum weight for included terms
667 * @return An ESet object containing the results of the
670 * @exception Xapian::InvalidArgumentError See class documentation.
672 XAPIAN_DEPRECATED(ESet
get_eset(Xapian::termcount maxitems
,
676 const Xapian::ExpandDecider
* edecider
= NULL
,
677 double min_wt
= 0.0) const) {
// Deprecated form taking k directly: select the "trad" expansion scheme
// with the supplied k, then defer to the get_eset() overload without k.
// NOTE(review): set_expansion_scheme() presumably leaves the scheme set
// on this object for subsequent get_eset() calls too - confirm.
678 set_expansion_scheme("trad", k
);
679 return get_eset(maxitems
, rset
, flags
, edecider
, min_wt
);
682 /** Get terms which match a given document, by document id.
684 * This method returns the terms in the current query which match
685 * the given document.
687 * It is possible for the document to have been removed from the
688 * database between the time it is returned in an MSet, and the
689 * time that this call is made. If possible, you should specify
690 * an MSetIterator instead of a Xapian::docid, since this will enable
691 * database backends with suitable support to prevent this
694 * Note that a query does not need to have been run in order to
697 * @param did The document id for which to retrieve the matching
700 * @return An iterator returning the terms which match the
701 * document. The terms will be returned (as far as this
702 * makes any sense) in the same order as the terms
703 * in the query. Terms will not occur more than once,
704 * even if they do in the query.
706 * @exception Xapian::InvalidArgumentError See class documentation.
707 * @exception Xapian::DocNotFoundError The document specified
708 * could not be found in the database.
710 TermIterator
get_matching_terms_begin(Xapian::docid did
) const;
712 /** End iterator corresponding to get_matching_terms_begin() */
713 TermIterator
XAPIAN_NOTHROW(get_matching_terms_end(Xapian::docid
/*did*/) const) {
// The docid parameter is left unnamed (it is not used): the end iterator
// is always a default-constructed TermIterator.
714 return TermIterator();
717 /** Get terms which match a given document, by match set item.
719 * This method returns the terms in the current query which match
720 * the given document.
722 * If the underlying database has suitable support, using this call
723 * (rather than passing a Xapian::docid) will enable the system to
724 * ensure that the correct data is returned, and that the document
725 * has not been deleted or changed since the query was performed.
727 * @param it The iterator for which to retrieve the matching terms.
729 * @return An iterator returning the terms which match the
730 * document. The terms will be returned (as far as this
731 * makes any sense) in the same order as the terms
732 * in the query. Terms will not occur more than once,
733 * even if they do in the query.
735 * @exception Xapian::InvalidArgumentError See class documentation.
736 * @exception Xapian::DocNotFoundError The document specified
737 * could not be found in the database.
739 TermIterator
get_matching_terms_begin(const MSetIterator
&it
) const;
741 /** End iterator corresponding to get_matching_terms_begin() */
742 TermIterator
XAPIAN_NOTHROW(get_matching_terms_end(const MSetIterator
&/*it*/) const) {
// The MSetIterator parameter is left unnamed (it is not used): the end
// iterator is always a default-constructed TermIterator.
743 return TermIterator();
746 /// Return a string describing this object.
747 std::string
get_description() const;
752 #endif /* XAPIAN_INCLUDED_ENQUIRE_H */