3 * @brief build a Xapian::Query object from a user query string
5 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016,2018,2019 Olly Betts
6 * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
7 * Copyright (C) 2010 Adam Sjøgren
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
27 #include "queryparser_internal.h"
29 #include "api/queryinternal.h"
32 #include "stringutils.h"
33 #include "xapian/error.h"
34 #include "xapian/unicode.h"
36 // Include the list of token values lemon generates.
37 #include "queryparser_token.h"
39 #include "cjk-tokenizer.h"
48 // We create the yyParser on the stack.
49 #define Parse_ENGINEALWAYSONSTACK
53 using namespace Xapian;
// True only when ch is below 128 and C_isupper() accepts it. The range test
// runs first, so the cast to unsigned char never truncates a larger code point.
56 U_isupper(unsigned ch) {
57 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
// True only when ch is below 128 and C_isdigit() accepts it. The range test
// runs first, so the cast to unsigned char never truncates a larger code point.
61 U_isdigit(unsigned ch) {
62 return ch < 128 && C_isdigit(static_cast<unsigned char>(ch));
// True only when ch is below 128 and C_isalpha() accepts it. The range test
// runs first, so the cast to unsigned char never truncates a larger code point.
66 U_isalpha(unsigned ch) {
67 return ch < 128 && C_isalpha(static_cast<unsigned char>(ch));
70 using Xapian::Unicode::is_whitespace;
// Logical negation of Unicode::is_whitespace(). The lexer passes it to
// find_if() to locate the first non-whitespace character.
73 is_not_whitespace(unsigned ch) {
74 return !is_whitespace(ch);
77 using Xapian::Unicode::is_wordchar;
// Logical negation of Unicode::is_wordchar(). The lexer passes it to
// find_if() to locate the end of a run of word characters.
80 is_not_wordchar(unsigned ch) {
81 return !is_wordchar(ch);
// True if the Unicode general category of ch is DECIMAL_DIGIT_NUMBER. Unlike
// U_isdigit() this is not limited to code points below 128.
85 is_digit(unsigned ch) {
86 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
// True for the characters which can be kept as a trailing suffix on a term:
// plus (as in C++ or Na+) and hash (as in C#).
89 // FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
90 // and there's the risk of hyphens getting stuck onto the end of terms...
92 is_suffix(unsigned ch) {
93 return ch == '+' || ch == '#';
// True if ch is the ASCII double quote or one of the two Unicode curly double
// quotes (U+201C and U+201D). No distinction is made between opening and
// closing forms.
97 is_double_quote(unsigned ch) {
98 // We simply treat all double quotes as equivalent, which is a bit crude,
99 // but it isn't clear that it would actually better to require them to
102 // 0x201c is Unicode opening double quote.
103 // 0x201d is Unicode closing double quote.
104 return ch == '"' || ch == 0x201c || ch == 0x201d;
// Report whether a colon must separate prefix from a term whose first
// character is ch. That is the case only when ch is accepted by U_isupper()
// or is itself a colon, the prefix is longer than one character, and the
// prefix does not already end with a colon.
108 prefix_needs_colon(const string & prefix, unsigned ch)
// A term starting with anything else can never be confused with the prefix.
110 if (!U_isupper(ch) && ch != ':') return false;
111 string::size_type len = prefix.length();
112 return (len > 1 && prefix[len - 1] != ':');
115 using Unicode::is_currency;
// True for the two operators treated as positional here: OP_PHRASE and
// OP_NEAR. State uses this to replace a positional default operator when
// FLAG_NO_POSITIONS is set.
118 is_positional(Xapian::Query::op op)
120 return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
125 /** Class used to pass information about a token from lexer to parser.
127 * Generally an instance of this class carries term information, but it can be
128 * used for a range query, and with some operators (e.g. the distance in
129 * NEAR/3 or ADJ/3, etc).
// Field description for this token. The query-building members read its
// prefixes, grouping and proc.
136 const FieldInfo * field_info;
// Stemming strategy which make_term() applies to name.
138 QueryParser::stem_strategy stem;
// Token with a name and a term position; stemming is STEM_NONE.
142 Term(const string &name_, termpos pos_)
143 : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
// Token with only a name; position 0 and stemming STEM_NONE.
144 explicit Term(const string &name_)
145 : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
// Token with a name and a field description; position 0 and stemming
// STEM_NONE.
146 Term(const string &name_, const FieldInfo * field_info_)
147 : name(name_), field_info(field_info_),
148 stem(QueryParser::STEM_NONE), pos(0) { }
// Token carrying only a number in pos. The lexer uses this for the width
// which follows NEAR/ or ADJ/.
149 explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
// Fully specified term token: parser state, name, field description, the
// unstemmed form, a stemming strategy (STEM_NONE unless given) and a position.
150 Term(State * state_, const string &name_, const FieldInfo * field_info_,
151 const string &unstemmed_,
152 QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
154 : state(state_), name(name_), field_info(field_info_),
155 unstemmed(unstemmed_), stem(stem_), pos(pos_) { }
// Token wrapping an already built query, with the grouping stored in name.
// State::range() creates its range tokens this way.
157 Term(const Xapian::Query & q, const string & grouping)
158 : name(grouping), query(q) { }
// Build the term string for this token under the given prefix.
160 string make_term(const string & prefix) const;
// Replace a stemming strategy of STEM_SOME by STEM_NONE; any other strategy is
// left unchanged.
162 void need_positions() {
163 if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
// Term position recorded for this token.
166 termpos get_termpos() const { return pos; }
// Grouping string of the field description this token refers to.
168 string get_grouping() const {
169 return field_info->grouping;
// Build an OP_SYNONYM query of one OP_WILDCARD subquery per prefix of the
// field.
172 Query * as_wildcarded_query(State * state) const;
174 /** Build a query for a term at the very end of the query string when
175 * FLAG_PARTIAL is in use.
177 * This query should match documents containing any terms which start with
178 * the characters specified, but should give a higher score to exact
179 * matches (since the user might have finished typing - we simply don't
182 Query * as_partial_query(State * state_) const;
184 /** Build a query for a string of CJK characters. */
185 Query * as_cjk_query() const;
187 /** Handle a CJK character string in a positional context. */
188 void as_positional_cjk_term(Terms * terms) const;
// Query for a RANGE token.
191 Query as_range_query() const;
// Query for this term: an OR over the terms built with each prefix, or the
// result of the field processor when the field has no prefixes.
193 Query get_query() const;
// As get_query(), with synonyms from the database added via OP_SYNONYM.
195 Query get_query_with_synonyms() const;
// As get_query_with_synonyms() when either automatic synonym flag is set in
// the parser state.
197 Query get_query_with_auto_synonyms() const;
200 /// Parser State shared between the lexer and the parser.
// The QueryParser::Internal this state was created for; most members below
// forward to it.
202 QueryParser::Internal * qpi;
// Error message for the current parse; NULL while no error is pending. The
// lexer loop stops as soon as this becomes non-NULL.
206 const char* error = NULL;
// Default operator after adjusting for FLAG_NO_POSITIONS (see constructor).
208 Query::op effective_default_op;
// Start from the default operator configured on the query parser. If
// FLAG_NO_POSITIONS is set and that operator is positional, use OP_AND
// instead.
210 State(QueryParser::Internal * qpi_, unsigned flags_)
211 : qpi(qpi_), flags(flags_), effective_default_op(qpi_->default_op)
213 if ((flags & QueryParser::FLAG_NO_POSITIONS)) {
214 if (is_positional(effective_default_op)) {
215 effective_default_op = Query::OP_AND;
// Stem term with the stemmer configured on the query parser.
220 string stem_term(const string &term) {
221 return qpi->stemmer(term);
// Append the name of term to the stoplist kept by the query parser.
224 void add_to_stoplist(const Term * term) {
225 qpi->stoplist.push_back(term->name);
// Record the pair (term, unstemmed) in the unstem map of the query parser.
228 void add_to_unstem(const string & term, const string & unstemmed) {
229 qpi->unstem.insert(make_pair(term, unstemmed));
// Turn the range with bounds a and b into a RANGE token by asking the
// registered range processors, in order, to check it.
//
// The grouping of the token depends on the type of the returned query:
// - for OP_VALUE_RANGE, OP_VALUE_GE and OP_VALUE_LE with default_grouping set
//   on the processor, the grouping is str(slot) for the value slot of the
//   query;
// - for LEAF_TERM it is the grouping configured on the processor;
// - the remaining return uses an empty grouping.
// NOTE(review): the switch scaffolding is not all visible here; it looks like
// OP_INVALID moves on to the next processor and the value-range cases without
// default_grouping share the LEAF_TERM return - confirm.
232 Term * range(const string &a, const string &b) {
233 for (auto i : qpi->rangeprocs) {
234 Xapian::Query range_query = (i.proc)->check_range(a, b);
235 Xapian::Query::op op = range_query.get_type();
237 case Xapian::Query::OP_INVALID:
239 case Xapian::Query::OP_VALUE_RANGE:
240 case Xapian::Query::OP_VALUE_GE:
241 case Xapian::Query::OP_VALUE_LE:
242 if (i.default_grouping) {
// The three operators above are the only ones reaching this cast.
243 Xapian::Internal::QueryValueBase * base =
244 static_cast<Xapian::Internal::QueryValueBase*>(
245 range_query.internal.get());
246 Xapian::valueno slot = base->get_slot();
247 return new Term(range_query, str(slot));
250 case Xapian::Query::LEAF_TERM:
251 return new Term(range_query, i.grouping);
253 return new Term(range_query, string());
// Default operator to combine terms with, already adjusted by the constructor.
259 Query::op default_op() const {
260 return effective_default_op;
// True if a stopper is set and it accepts the name of term.
263 bool is_stopword(const Term *term) const {
264 return qpi->stopper.get() && (*qpi->stopper)(term->name);
// Database associated with the query parser.
267 Database get_database() const {
// Stopper set on the query parser, or a null pointer if there is none.
271 const Stopper * get_stopper() const {
272 return qpi->stopper.get();
// Current number of entries in the stoplist.
275 size_t stoplist_size() const {
276 return qpi->stoplist.size();
// Resize the stoplist to s entries.
279 void stoplist_resize(size_t s) {
280 qpi->stoplist.resize(s);
// Limit and limit type used when expanding wildcard terms.
283 Xapian::termcount get_max_wildcard_expansion() const {
284 return qpi->max_wildcard_expansion;
287 int get_max_wildcard_type() const {
288 return qpi->max_wildcard_type;
// Limit and limit type used when expanding a partial final term.
291 Xapian::termcount get_max_partial_expansion() const {
292 return qpi->max_partial_expansion;
295 int get_max_partial_type() const {
296 return qpi->max_partial_type;
// Build the term string for this token under the given prefix.
//
// A colon is appended after a non-empty prefix when prefix_needs_colon() asks
// for one based on the first character of name. When stem is not STEM_NONE
// the stemmed form from State::stem_term() is appended. If an unstemmed form
// was recorded for this token, the pair (term, unstemmed) is registered with
// State::add_to_unstem().
301 Term::make_term(const string & prefix) const
// Strategies other than STEM_NONE and STEM_ALL get special treatment here.
304 if (stem != QueryParser::STEM_NONE && stem != QueryParser::STEM_ALL)
306 if (!prefix.empty()) {
308 if (prefix_needs_colon(prefix, name[0])) term += ':';
310 if (stem != QueryParser::STEM_NONE) {
311 term += state->stem_term(name);
316 if (!unstemmed.empty())
317 state->add_to_unstem(term, unstemmed);
321 // Iterator shim to allow building a synonym query from a TermIterator pair.
//
// Dereferencing yields a Query: the query pointed to by first while first is
// non-null, otherwise a term query for the current term of i at position pos.
322 class SynonymIterator {
// Underlying iterator over the synonym terms.
323 Xapian::TermIterator i;
// Optional query to yield ahead of the terms from i; may be null.
327 const Xapian::Query * first;
// pos_ defaults to 0 and first_ to NULL, so an end marker can be built from a
// TermIterator alone.
330 SynonymIterator(const Xapian::TermIterator & i_,
331 Xapian::termpos pos_ = 0,
332 const Xapian::Query * first_ = NULL)
333 : i(i_), pos(pos_), first(first_) { }
335 SynonymIterator & operator++() {
343 const Xapian::Query operator*() const {
344 if (first) return *first;
// Term queries are built with a wqf of 1.
345 return Xapian::Query(*i, 1, pos);
// Equal only if both the term iterator and the first pointer match.
348 bool operator==(const SynonymIterator & o) const {
349 return i == o.i && first == o.first;
352 bool operator!=(const SynonymIterator & o) const {
353 return !(*this == o);
// Member types expected of an input iterator.
356 typedef std::input_iterator_tag iterator_category;
357 typedef Xapian::Query value_type;
358 typedef Xapian::termcount_diff difference_type;
359 typedef Xapian::Query * pointer;
360 typedef Xapian::Query & reference;
// Build the query for this term expanded with single-word synonyms.
//
// When the field has no prefixes, the field processor is applied to name and
// its result returned. Otherwise the starting point is get_query(). For each
// prefix the synonyms of the unstemmed term are looked up in the database; if
// there are none and stem is not STEM_NONE, the lookup is repeated with a key
// built from State::stem_term(name). Each pass then wraps the query so far,
// followed by the synonyms found, in a new OP_SYNONYM query.
364 Term::get_query_with_synonyms() const
366 // Handle single-word synonyms with each prefix.
367 const auto& prefixes = field_info->prefixes;
368 if (prefixes.empty()) {
// A field without prefixes is expected to have a field processor.
369 Assert(field_info->proc.get());
370 return (*field_info->proc)(name);
373 Query q = get_query();
375 for (auto&& prefix : prefixes) {
376 // First try the unstemmed term:
378 if (!prefix.empty()) {
380 if (prefix_needs_colon(prefix, name[0])) term += ':';
384 Xapian::Database db = state->get_database();
385 Xapian::TermIterator syn = db.synonyms_begin(term);
386 Xapian::TermIterator end = db.synonyms_end(term);
387 if (syn == end && stem != QueryParser::STEM_NONE) {
388 // If that has no synonyms, try the stemmed form:
390 if (!prefix.empty()) {
392 if (prefix_needs_colon(prefix, name[0])) term += ':';
394 term += state->stem_term(name);
395 syn = db.synonyms_begin(term);
396 end = db.synonyms_end(term);
// The first iterator yields the current q ahead of the synonym terms, so the
// existing query becomes the first member of the new OP_SYNONYM query.
398 q = Query(q.OP_SYNONYM,
399 SynonymIterator(syn, pos, &q),
400 SynonymIterator(end));
// Return get_query_with_synonyms() when the parser flags enable automatic
// synonyms, that is when FLAG_AUTO_SYNONYMS or FLAG_AUTO_MULTIWORD_SYNONYMS is
// set in the state.
406 Term::get_query_with_auto_synonyms() const
408 const unsigned MASK_ENABLE_AUTO_SYNONYMS =
409 QueryParser::FLAG_AUTO_SYNONYMS |
410 QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
411 if (state->flags & MASK_ENABLE_AUTO_SYNONYMS)
412 return get_query_with_synonyms();
// Combine the query pointed to by term into the query pointed to by q using
// op. OP_OR and OP_AND have dedicated branches; any other operator builds
// Query(op, *q, *term) and stores it back through q.
418 add_to_query(Query *& q, Query::op op, Query * term)
422 if (op == Query::OP_OR) {
424 } else if (op == Query::OP_AND) {
427 *q = Query(op, *q, *term);
// Overload taking the query to add by const reference rather than by pointer.
// OP_OR and OP_AND have dedicated branches; any other operator builds
// Query(op, *q, term) and stores it back through q.
436 add_to_query(Query *& q, Query::op op, const Query & term)
439 if (op == Query::OP_OR) {
441 } else if (op == Query::OP_AND) {
444 *q = Query(op, *q, term);
// Build the plain query for this term.
//
// When the field has no prefixes, the field processor is applied to name and
// its result returned. Otherwise a term query is built for make_term() of the
// first prefix and the term queries for the remaining prefixes are OR-ed onto
// it. Every term query uses a wqf of 1 and the position of this token.
452 Term::get_query() const
454 const auto& prefixes = field_info->prefixes;
455 if (prefixes.empty()) {
// A field without prefixes is expected to have a field processor.
456 Assert(field_info->proc.get());
457 return (*field_info->proc)(name);
459 auto piter = prefixes.begin();
460 Query q(make_term(*piter), 1, pos);
461 while (++piter != prefixes.end()) {
462 q |= Query(make_term(*piter), 1, pos);
// Build the query for a wildcarded term: one OP_WILDCARD subquery per prefix
// of the field, limited by the wildcard expansion settings taken from state_,
// all wrapped in a single heap-allocated OP_SYNONYM query.
468 Term::as_wildcarded_query(State * state_) const
470 const auto& prefixes = field_info->prefixes;
471 Xapian::termcount max = state_->get_max_wildcard_expansion();
472 int max_type = state_->get_max_wildcard_type();
474 subqs.reserve(prefixes.size());
// root is taken by value on purpose: it is a private copy of the prefix which
// serves as the pattern for the wildcard subquery.
475 for (string root : prefixes) {
477 // Combine with OP_OR, and apply OP_SYNONYM afterwards.
478 subqs.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
481 Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
// Build the query for the final term of a partial (FLAG_PARTIAL) query.
//
// For each prefix of the field two subqueries are collected: an OP_WILDCARD
// query limited by the partial expansion settings taken from state_, and the
// ordinary term query built with make_term(). The result is a heap-allocated
// OP_OR of two OP_SYNONYM queries, one over the wildcard subqueries and one
// over the ordinary term subqueries.
487 Term::as_partial_query(State * state_) const
489 Xapian::termcount max = state_->get_max_partial_expansion();
490 int max_type = state_->get_max_partial_type();
491 vector<Query> subqs_partial; // A synonym of all the partial terms.
492 vector<Query> subqs_full; // A synonym of all the full terms.
494 for (const string& prefix : field_info->prefixes) {
495 string root = prefix;
497 // Combine with OP_OR, and apply OP_SYNONYM afterwards.
498 subqs_partial.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
500 // Add the term, as it would normally be handled, as an alternative.
501 subqs_full.push_back(Query(make_term(prefix), 1, pos));
503 Query * q = new Query(Query::OP_OR,
504 Query(Query::OP_SYNONYM,
505 subqs_partial.begin(), subqs_partial.end()),
506 Query(Query::OP_SYNONYM,
507 subqs_full.begin(), subqs_full.end()));
// Build the query for a run of CJK characters held in name.
//
// For each prefix of the field, name is split with CJKTokenIterator and a term
// query is made for every token (prefix followed by the token, wqf 1, the
// position of this token); those are combined with OP_AND. The per-prefix
// queries are then combined with OP_OR into a heap-allocated query.
513 Term::as_cjk_query() const
515 vector<Query> prefix_subqs;
516 vector<Query> cjk_subqs;
517 const auto& prefixes = field_info->prefixes;
518 for (const string& prefix : prefixes) {
// A default-constructed CJKTokenIterator serves as the end marker.
519 for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
520 cjk_subqs.push_back(Query(prefix + *tk, 1, pos));
522 prefix_subqs.push_back(Query(Query::OP_AND,
523 cjk_subqs.begin(), cjk_subqs.end()));
526 Query * q = new Query(Query::OP_OR,
527 prefix_subqs.begin(), prefix_subqs.end());
// Out-of-class definition of Term::as_range_query(), which the class declares
// as returning a Query by value.
// NOTE(review): presumably hands back the query stored by the range-token
// constructor - confirm against the full body.
533 Term::as_range_query() const
// True if ch is one of the characters which join adjacent terms into a phrase:
// full stop, hyphen, slash, colon, backslash or at sign. Zero is rejected
// explicitly because strchr() would otherwise match the terminator of the
// list, and values of 128 and above are rejected before the lookup.
541 is_phrase_generator(unsigned ch)
543 // These characters generate a phrase search.
544 // Ordered mostly by frequency of calls to this function done when
545 // running the testcases in api_queryparser.cc.
546 return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
// True if ch is one of the characters which, when found directly after a term,
// stop the lexer from stemming that term under STEM_SOME and
// STEM_SOME_FULL_POS. Zero is rejected explicitly because strchr() would
// otherwise match the terminator of the list, and values of 128 and above are
// rejected before the lookup.
550 is_stem_preventer(unsigned ch)
552 return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
// Decide from the first character of term whether the term is a candidate for
// stemming. The result is non-zero when the Unicode category of that
// character is lowercase letter, titlecase letter, modifier letter or other
// letter, so a term starting with an uppercase letter or a digit is not
// stemmed.
556 should_stem(const string & term)
// One bit per accepted category, indexed by the category value.
558 const unsigned int SHOULD_STEM_MASK =
559 (1 << Unicode::LOWERCASE_LETTER) |
560 (1 << Unicode::TITLECASE_LETTER) |
561 (1 << Unicode::MODIFIER_LETTER) |
562 (1 << Unicode::OTHER_LETTER);
563 Utf8Iterator u(term);
564 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
const unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();

/** Classify a non-word character found between two word characters.
 *
 *  @param ch	The Unicode code point to classify.
 *
 *  @return	The code point to add to the term (ch itself for an accepted
 *		infix character, ASCII apostrophe for U+2019 and U+201B),
 *		UNICODE_IGNORE for the zero-width format characters
 *		U+200B..U+200D, U+2060 and U+FEFF, or 0 for anything else.
 */
inline unsigned check_infix(unsigned ch) {
    if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
	// Unicode includes all these except '&' in its word boundary rules,
	// as well as 0x2019 (which we handle below) and ':' (for Swedish
	// apparently, but we ignore this for now as it's problematic in
	// real world cases).
	return ch;
    }
    // 0x2019 is Unicode apostrophe and single closing quote.
    // 0x201b is Unicode single opening quote with the tail rising.
    if (ch == 0x2019 || ch == 0x201b)
	return '\'';
    // The lower bound matters: without it every code point up to 0x200d
    // (which includes ASCII punctuation such as '.' and '-') would be
    // reported as ignorable instead of as not being an infix character.
    // This is the same range test as check_infix_digit() uses.
    if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
	return UNICODE_IGNORE;
    return 0;
}
// Classify a non-word character found between two digits. The code points
// listed in the case labels are the accepted numeric infix characters.
// UNICODE_IGNORE is returned for the code points U+200B to U+200D, U+2060 and
// U+FEFF.
591 inline unsigned check_infix_digit(unsigned ch) {
592 // This list of characters comes from Unicode's word identifying algorithm.
597 case 0x037e: // GREEK QUESTION MARK
598 case 0x0589: // ARMENIAN FULL STOP
599 case 0x060D: // ARABIC DATE SEPARATOR
600 case 0x07F8: // NKO COMMA
601 case 0x2044: // FRACTION SLASH
602 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
603 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
604 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
// The lower bound keeps code points below U+200B out of the ignorable set.
607 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
608 return UNICODE_IGNORE;
612 // Prototype a function lemon generates, but which we want to call before that
613 // in the generated source code file.
615 static void yy_parse_failed(yyParser *);
// Register prefix as a NON_BOOLEAN term prefix for field.
//
// A field not seen before gets a new FieldInfo. For a known field the prefix
// is appended to its list of prefixes, after checking that the field is also
// NON_BOOLEAN (otherwise InvalidOperationError is thrown) and that it has no
// FieldProcessor (otherwise FeatureUnavailableError is thrown).
618 QueryParser::Internal::add_prefix(const string &field, const string &prefix)
620 map<string, FieldInfo>::iterator p = field_map.find(field);
621 if (p == field_map.end()) {
622 field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, prefix)));
624 // Check that this is the same type of filter as the existing one(s).
625 if (p->second.type != NON_BOOLEAN) {
626 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
628 if (p->second.proc.get())
629 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
630 p->second.prefixes.push_back(prefix);
// Register the field processor proc as the NON_BOOLEAN handler for field.
//
// This only succeeds for a field not seen before. For a known field it
// always throws: InvalidOperationError if the field is not NON_BOOLEAN,
// otherwise FeatureUnavailableError (with one message if the field already
// has string prefixes and another if not).
635 QueryParser::Internal::add_prefix(const string &field, FieldProcessor *proc)
637 map<string, FieldInfo>::iterator p = field_map.find(field);
638 if (p == field_map.end()) {
639 field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, proc)));
641 // Check that this is the same type of filter as the existing one(s).
642 if (p->second.type != NON_BOOLEAN) {
643 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
645 if (!p->second.prefixes.empty())
646 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
647 throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
// Register prefix as a boolean filter prefix for field.
//
// A null grouping means the field name is used as the grouping. An empty
// grouping gives filter type BOOLEAN, any other grouping BOOLEAN_EXCLUSIVE.
// A field not seen before gets a new FieldInfo. For a known field the prefix
// is appended to its list of prefixes, after checking that the filter type
// matches (otherwise InvalidOperationError is thrown) and that the field has
// no FieldProcessor (otherwise FeatureUnavailableError is thrown).
// UnimplementedError is thrown for the empty prefix case described below.
652 QueryParser::Internal::add_boolean_prefix(const string &field,
653 const string &prefix,
654 const string* grouping)
656 // Don't allow the empty prefix to be set as boolean as it doesn't
657 // really make sense.
659 throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
660 if (!grouping) grouping = &field;
661 filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
662 map<string, FieldInfo>::iterator p = field_map.find(field);
663 if (p == field_map.end()) {
664 field_map.insert(make_pair(field, FieldInfo(type, prefix, *grouping)));
666 // Check that this is the same type of filter as the existing one(s).
667 if (p->second.type != type) {
668 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
670 if (p->second.proc.get())
671 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
// Only the prefix is recorded here; the grouping of the existing entry is
// left as it was.
672 p->second.prefixes.push_back(prefix); // FIXME grouping
// Register the field processor proc as a boolean filter handler for field.
//
// A null grouping means the field name is used as the grouping. An empty
// grouping gives filter type BOOLEAN, any other grouping BOOLEAN_EXCLUSIVE.
// This only succeeds for a field not seen before. For a known field it
// always throws: InvalidOperationError if the filter type differs, otherwise
// FeatureUnavailableError (with one message if the field already has string
// prefixes and another if not). UnimplementedError is thrown for the empty
// prefix case described below.
677 QueryParser::Internal::add_boolean_prefix(const string &field,
678 FieldProcessor *proc,
679 const string* grouping)
681 // Don't allow the empty prefix to be set as boolean as it doesn't
682 // really make sense.
684 throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
685 if (!grouping) grouping = &field;
686 filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
687 map<string, FieldInfo>::iterator p = field_map.find(field);
688 if (p == field_map.end()) {
689 field_map.insert(make_pair(field, FieldInfo(type, proc, *grouping)));
691 // Check that this is the same type of filter as the existing one(s).
692 if (p->second.type != type) {
693 throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
695 if (!p->second.prefixes.empty())
696 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
697 throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
// Lex one term starting at it and return its text.
//
// Three shapes of term are tried in order:
//  1. an acronym: uppercase ASCII letters separated by full stops, at least
//     two letters long and not directly followed by a word character;
//  2. when cjk_ngram is set, a run of CJK characters;
//  3. an ordinary word, which may contain single infix characters accepted by
//     check_infix() or check_infix_digit() and may end with up to three
//     suffix characters (see is_suffix()).
//
// Parameters:
//  it           current position in the query string; read and moved forward
//               as the term is consumed
//  end          end of the query string
//  cjk_ngram    whether CJK characters get separate treatment
//  is_cjk_term  output flag for the CJK case
//               NOTE(review): presumably set when shape 2 matched - confirm
//  was_acronym  set to true exactly when shape 1 matched
702 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
703 bool cjk_ngram, bool & is_cjk_term,
707 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
708 // Don't worry if there's a trailing '.' or not.
709 if (U_isupper(*it)) {
// Collect the letters into t, stepping over the full stop between them. The
// loop ends at the first position which does not continue the pattern.
713 Unicode::append_utf8(t, *p++);
714 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
715 // One letter does not make an acronym! If we handled a single
716 // uppercase letter here, we wouldn't catch M&S below.
717 if (t.length() > 1) {
718 // Check there's not a (lower case) letter or digit
719 // immediately after it.
720 // FIXME: should I.B.M..P.T.O be a range search?
721 if (p == end || !is_wordchar(*p)) {
727 was_acronym = !term.empty();
// CJK text is only considered when no acronym was found.
729 if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
730 const char* cjk = it.raw();
// The term is the raw byte range between the remembered start and wherever it
// now points.
732 term.assign(cjk, it.raw() - cjk);
// Ordinary word: the first character is taken unconditionally.
737 unsigned prevch = *it;
738 Unicode::append_utf8(term, prevch);
739 while (++it != end) {
// With CJK handling on, a CJK character ends the current word.
740 if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
742 if (!is_wordchar(ch)) {
743 // Treat a single embedded '&' or "'" or similar as a word
744 // character (e.g. AT&T, Fred's). Also, normalise
745 // apostrophes to ASCII apostrophe.
// Only a non-word character directly followed by a word character can be an
// infix character.
748 if (p == end || !is_wordchar(*p)) break;
749 unsigned nextch = *p;
// Between two digits the numeric separator rules apply, otherwise the general
// ones.
750 if (is_digit(prevch) && is_digit(nextch)) {
751 ch = check_infix_digit(ch);
753 ch = check_infix(ch);
756 if (ch == UNICODE_IGNORE)
759 Unicode::append_utf8(term, ch);
762 if (it != end && is_suffix(*it)) {
763 string suff_term = term;
765 // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
// Stop once three suffix characters have been collected.
767 if (suff_term.size() - term.size() == 3) {
772 } while (is_suffix(*++p));
// The suffixed form is only a candidate if it is not directly followed by a
// word character.
773 if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
774 // If the suffixed term doesn't exist, check that the
775 // non-suffixed term does. This also takes care of
776 // the case when QueryParser::set_database() hasn't
778 bool use_suff_term = false;
779 string lc = Unicode::tolower(suff_term);
780 if (db.term_exists(lc)) {
781 use_suff_term = true;
// Neither form being indexed also selects the suffixed form.
783 lc = Unicode::tolower(term);
784 if (!db.term_exists(lc)) use_suff_term = true;
797 // Switch to %code to insert at the end of the file so struct yyParser has been
802 QueryParser::Internal::parse_query(const string &qs, unsigned flags,
803 const string &default_prefix)
805 bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
807 // Set ranges if we may have to handle ranges in the query.
808 bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);
810 termpos term_pos = 1;
811 Utf8Iterator it(qs), end;
813 State state(this, flags);
815 // To successfully apply more than one spelling correction to a query
816 // string, we must keep track of the offset due to previous corrections.
817 int correction_offset = 0;
818 corrected_query.resize(0);
820 // Stack of prefixes, used for phrases and subexpressions.
821 list<const FieldInfo *> prefix_stack;
823 // If default_prefix is specified, use it. Otherwise, use any list
824 // that has been set for the empty prefix.
825 const FieldInfo def_pfx(NON_BOOLEAN, default_prefix);
827 const FieldInfo * default_field_info = &def_pfx;
828 if (default_prefix.empty()) {
829 auto f = field_map.find(string());
830 if (f != field_map.end()) default_field_info = &(f->second);
833 // We always have the current prefix on the top of the stack.
834 prefix_stack.push_back(default_field_info);
839 unsigned newprev = ' ';
842 DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
843 IN_GROUP2, EXPLICIT_SYNONYM
845 while (it != end && !state.error) {
846 bool last_was_operator = false;
847 bool last_was_operator_needing_term = false;
848 if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
851 if (it == end) break;
853 last_was_operator_needing_term = false;
854 last_was_operator = true;
857 just_had_operator_needing_term:
858 last_was_operator_needing_term = true;
859 last_was_operator = true;
861 if (mode == IN_PHRASED_TERM) mode = DEFAULT;
862 if (is_whitespace(*it)) {
865 it = find_if(it, end, is_not_whitespace);
866 if (it == end) break;
870 (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
871 // Scan forward to see if this could be the "start of range"
872 // token. Sadly this has O(n²) tendencies, though at least
873 // "n" is the number of words in a query which is likely to
874 // remain fairly small. FIXME: can we tokenise more elegantly?
875 Utf8Iterator it_initial = it;
879 if (ch == '.' && *p == '.') {
882 Unicode::append_utf8(a, *it++);
884 // Trim off the trailing ".".
885 a.resize(a.size() - 1);
887 // Either end of the range can be empty (for an open-ended
888 // range) but both can't be empty.
889 if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
891 // Allow any character except whitespace and ')' in the
893 while (p != end && *p > ' ' && *p != ')') {
894 Unicode::append_utf8(b, *p++);
896 Term * range = state.range(a, b);
898 state.error = "Unknown range operation";
899 if (a.find(':', 1) == string::npos) {
902 // Might be a boolean filter with ".." in. Leave
903 // state.error in case it isn't.
907 Parse(&parser, RANGE, range, &state);
913 // Allow any character except whitespace and '(' in the lower
915 if (ch <= ' ' || ch == '(') break;
920 if (!is_wordchar(*it)) {
921 unsigned prev = newprev;
924 // Drop out of IN_GROUP mode.
925 if (mode == IN_GROUP || mode == IN_GROUP2)
929 case 0x201c: // Left curly double quote.
930 case 0x201d: // Right curly double quote.
932 if (mode == DEFAULT) {
934 it = find_if(it, end, is_not_whitespace);
936 // Ignore an unmatched " at the end of the query to
937 // avoid generating an empty pair of QUOTEs which will
938 // cause a parse error.
941 if (is_double_quote(*it)) {
942 // Ignore empty "" (but only if we're not already
943 // IN_QUOTES as we don't merge two adjacent quoted
949 if (flags & QueryParser::FLAG_PHRASE) {
950 if (ch == '"' && it != end && *it == '"') {
952 // Handle "" inside a quoted phrase as an escaped " for
953 // consistency with quoted boolean terms.
956 Parse(&parser, QUOTE, NULL, &state);
957 if (mode == DEFAULT) {
960 // Remove the prefix we pushed for this phrase.
961 if (mode == IN_PREFIXED_QUOTES)
962 prefix_stack.pop_back();
968 case '+': case '-': // Loved or hated term/phrase/subexpression.
969 // Ignore + or - at the end of the query string.
970 if (it == end) goto done;
971 if (prev > ' ' && prev != '(') {
972 // Or if not after whitespace or an open bracket.
975 if (is_whitespace(*it) || *it == '+' || *it == '-') {
976 // Ignore + or - followed by a space, or further + or -.
977 // Postfix + (such as in C++ and H+) is handled as part of
978 // the term lexing code in parse_term().
982 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
986 } else if (last_was_operator) {
987 token = HATE_AFTER_AND;
991 Parse(&parser, token, NULL, &state);
992 goto just_had_operator_needing_term;
994 // Need to prevent the term after a LOVE or HATE starting a
998 case '(': // Bracketed subexpression.
1000 it = find_if(it, end, is_not_whitespace);
1001 // Ignore ( at the end of the query string.
1002 if (it == end) goto done;
1003 if (prev > ' ' && strchr("()+-", prev) == NULL) {
1004 // Or if not after whitespace or a bracket or '+' or '-'.
1012 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1013 prefix_stack.push_back(prefix_stack.back());
1014 Parse(&parser, BRA, NULL, &state);
1018 case ')': // End of bracketed subexpression.
1019 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1020 // Remove the prefix we pushed for the corresponding BRA.
1021 // If brackets are unmatched, it's a syntax error, but
1022 // that's no excuse to SEGV!
1023 if (prefix_stack.size() > 1) prefix_stack.pop_back();
1024 Parse(&parser, KET, NULL, &state);
1028 case '~': // Synonym expansion.
1029 // Ignore at the end of the query string.
1030 if (it == end) goto done;
1031 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
1032 if (prev > ' ' && strchr("+-(", prev) == NULL) {
1033 // Or if not after whitespace, +, -, or an open bracket.
1036 if (!is_wordchar(*it)) {
1037 // Ignore if not followed by a word character.
1040 Parse(&parser, SYNONYM, NULL, &state);
1041 mode = EXPLICIT_SYNONYM;
1042 goto just_had_operator_needing_term;
1046 // Skip any other characters.
1050 Assert(is_wordchar(*it));
1052 size_t term_start_index = it.raw() - qs.data();
1054 newprev = 'A'; // Any letter will do...
1056 // A term, a prefix, or a boolean operator.
1057 const FieldInfo * field_info = NULL;
1058 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
1059 !field_map.empty()) {
1060 // Check for a fieldname prefix (e.g. title:historical).
1061 Utf8Iterator p = find_if(it, end, is_not_wordchar);
1062 if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
1066 Unicode::append_utf8(field, *p++);
1067 map<string, FieldInfo>::const_iterator f;
1068 f = field_map.find(field);
1069 if (f != field_map.end()) {
1070 // Special handling for prefixed fields, depending on the
1071 // type of the prefix.
1073 field_info = &(f->second);
1075 if (field_info->type != NON_BOOLEAN) {
1076 // Drop out of IN_GROUP if we're in it.
1077 if (mode == IN_GROUP || mode == IN_GROUP2)
1081 if (it != end && is_double_quote(*it)) {
1082 // Quoted boolean term (can contain any character).
1083 bool fancy = (*it != '"');
1087 // Interpret "" as an escaped ".
1088 if (++it == end || *it != '"')
1090 } else if (fancy && is_double_quote(*it)) {
1091 // If the opening quote was ASCII, then the
1092 // closing one must be too - otherwise
1093 // the user can't protect non-ASCII double
1094 // quote characters by quoting or escaping.
1098 Unicode::append_utf8(name, *it++);
1101 // Can't boolean filter prefix a subexpression, so
1102 // just use anything following the prefix until the
1103 // next space or ')' as part of the boolean filter
1105 while (it != end && *it > ' ' && *it != ')')
1106 Unicode::append_utf8(name, *it++);
1108 // Build the unstemmed form in field.
1111 // Clear any pending range error.
1113 Term * token = new Term(&state, name, field_info, field);
1114 Parse(&parser, BOOLEAN_FILTER, token, &state);
1118 if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
1119 // Prefixed phrase, e.g.: subject:"space flight"
1120 mode = IN_PREFIXED_QUOTES;
1121 Parse(&parser, QUOTE, NULL, &state);
1125 prefix_stack.push_back(field_info);
1129 if (ch == '(' && (flags & FLAG_BOOLEAN)) {
1130 // Prefixed subexpression, e.g.: title:(fast NEAR food)
1132 Parse(&parser, BRA, NULL, &state);
1136 prefix_stack.push_back(field_info);
1141 // Allow 'path:/usr/local' but not 'foo::bar::baz'.
1142 while (is_phrase_generator(ch)) {
1149 if (is_wordchar(ch)) {
1154 // It looks like a prefix but isn't, so parse it as
1164 bool is_cjk_term = false;
1165 string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
1167 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
1168 (flags & FLAG_BOOLEAN) &&
1169 // Don't want to interpret A.N.D. as an AND operator.
1172 term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
1173 // Boolean operators.
1175 if (flags & FLAG_BOOLEAN_ANY_CASE) {
1176 for (string::iterator i = op.begin(); i != op.end(); ++i) {
1180 if (op.size() == 3) {
1182 Parse(&parser, AND, NULL, &state);
1183 goto just_had_operator;
1186 Parse(&parser, NOT, NULL, &state);
1187 goto just_had_operator;
1190 Parse(&parser, XOR, NULL, &state);
1191 goto just_had_operator;
1194 if (it != end && *it == '/') {
1196 Utf8Iterator p = it;
1197 while (++p != end && U_isdigit(*p)) {
1198 width = (width * 10) + (*p - '0');
1200 if (width && (p == end || is_whitespace(*p))) {
1202 Parse(&parser, ADJ, new Term(width), &state);
1203 goto just_had_operator;
1206 Parse(&parser, ADJ, NULL, &state);
1207 goto just_had_operator;
1210 } else if (op.size() == 2) {
1212 Parse(&parser, OR, NULL, &state);
1213 goto just_had_operator;
1215 } else if (op.size() == 4) {
1217 if (it != end && *it == '/') {
1219 Utf8Iterator p = it;
1220 while (++p != end && U_isdigit(*p)) {
1221 width = (width * 10) + (*p - '0');
1223 if (width && (p == end || is_whitespace(*p))) {
1225 Parse(&parser, NEAR, new Term(width), &state);
1226 goto just_had_operator;
1229 Parse(&parser, NEAR, NULL, &state);
1230 goto just_had_operator;
1236 // If no prefix is set, use the default one.
1237 if (!field_info) field_info = prefix_stack.back();
1239 Assert(field_info->type == NON_BOOLEAN);
1242 string unstemmed_term(term);
1243 term = Unicode::tolower(term);
1245 // Reuse stem_strategy - STEM_SOME here means "stem terms except
1246 // when used with positional operators".
1247 stem_strategy stem_term = stem_action;
1248 if (stem_term != STEM_NONE) {
1249 if (stemmer.is_none()) {
1250 stem_term = STEM_NONE;
1251 } else if (stem_term == STEM_SOME ||
1252 stem_term == STEM_SOME_FULL_POS) {
1253 if (!should_stem(unstemmed_term) ||
1254 (it != end && is_stem_preventer(*it))) {
1255 // Don't stem this particular term.
1256 stem_term = STEM_NONE;
1261 Term * term_obj = new Term(&state, term, field_info,
1262 unstemmed_term, stem_term, term_pos++);
1265 Parse(&parser, CJKTERM, term_obj, &state);
1266 if (it == end) break;
1270 if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1272 if ((flags & FLAG_WILDCARD) && *it == '*') {
1275 if (p == end || !is_wordchar(*p)) {
1277 if (mode == IN_GROUP || mode == IN_GROUP2) {
1278 // Drop out of IN_GROUP and flag that the group
1279 // can be empty if all members are stopwords.
1280 if (mode == IN_GROUP2)
1281 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1284 // Wildcard at end of term (also known as
1285 // "right truncation").
1286 Parse(&parser, WILD_TERM, term_obj, &state);
1291 if (flags & FLAG_PARTIAL) {
1292 if (mode == IN_GROUP || mode == IN_GROUP2) {
1293 // Drop out of IN_GROUP and flag that the group
1294 // can be empty if all members are stopwords.
1295 if (mode == IN_GROUP2)
1296 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1299 // Final term of a partial match query, with no
1300 // following characters - treat as a wildcard.
1301 Parse(&parser, PARTIAL_TERM, term_obj, &state);
1307 // Check spelling, if we're a normal term, and any of the prefixes are empty.
1309 if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
1310 const auto& prefixes = field_info->prefixes;
1311 for (const string& prefix : prefixes) {
1312 if (!prefix.empty())
1314 const string & suggest = db.get_spelling_suggestion(term);
1315 if (!suggest.empty()) {
1316 if (corrected_query.empty()) corrected_query = qs;
1317 size_t term_end_index = it.raw() - qs.data();
1318 size_t n = term_end_index - term_start_index;
1319 size_t pos = term_start_index + correction_offset;
1320 corrected_query.replace(pos, n, suggest);
1321 correction_offset += suggest.size();
1322 correction_offset -= n;
1328 if (mode == IN_PHRASED_TERM) {
1329 Parse(&parser, PHR_TERM, term_obj, &state);
1331 // See if the next token will be PHR_TERM - if so, this one
1332 // needs to be TERM not GROUP_TERM.
1333 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
1334 is_phrase_generator(*it)) {
1335 // FIXME: can we clean this up?
1336 Utf8Iterator p = it;
1339 } while (p != end && is_phrase_generator(*p));
1340 // Don't generate a phrase unless the phrase generators are
1341 // immediately followed by another term.
1342 if (p != end && is_wordchar(*p)) {
1348 if (mode == IN_GROUP || mode == IN_GROUP2) {
1352 Parse(&parser, token, term_obj, &state);
1353 if (token == TERM && mode != DEFAULT)
1358 if (it == end) break;
1360 if (is_phrase_generator(*it)) {
1361 // Skip multiple phrase generators.
1364 } while (it != end && is_phrase_generator(*it));
1365 // Don't generate a phrase unless the phrase generators are
1366 // immediately followed by another term.
1367 if (it != end && is_wordchar(*it)) {
1368 mode = IN_PHRASED_TERM;
1369 term_start_index = it.raw() - qs.data();
1372 } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1373 int old_mode = mode;
1375 if (!last_was_operator_needing_term && is_whitespace(*it)) {
1377 // Skip multiple whitespace.
1380 } while (it != end && is_whitespace(*it));
1381 // Don't generate a group unless the terms are only separated by whitespace.
1383 if (it != end && is_wordchar(*it)) {
1384 if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
1395 // Implicitly close any unclosed quotes.
1396 if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
1397 Parse(&parser, QUOTE, NULL, &state);
1399 // Implicitly close all unclosed brackets.
1400 while (prefix_stack.size() > 1) {
1401 Parse(&parser, KET, NULL, &state);
1402 prefix_stack.pop_back();
1404 Parse(&parser, 0, NULL, &state);
1407 errmsg = state.error;
// Members of ProbQuery, the object the `prob` grammar rules build up: a main
// query plus boolean filters keyed by grouping.
// The main query; NULL unless one is supplied.
1415 Query* query = NULL;
1418 // filter is a map from prefix to a query for that prefix. Queries with
1419 // the same prefix are combined with OR, and the results of this are
1420 // combined with AND to get the full filter.
1421 map<string, Query> filter;
// Wrap an existing Query pointer as the main query.
1426 ProbQuery(Query* query_) : query(query_) {}
// Store q as the filter for `grouping`, replacing any existing entry.
1434 void add_filter(const string& grouping, const Query & q) {
1435 filter[grouping] = q;
// Merge qnew into the filter for `grouping`; if there is no entry yet, qnew
// is simply inserted.
1438 void append_filter(const string& grouping, const Query & qnew) {
1439 auto it = filter.find(grouping);
1440 if (it == filter.end()) {
1441 filter.insert(make_pair(grouping, qnew));
1443 Query & q = it->second;
1444 // We OR multiple filters with the same prefix if they're
1445 // exclusive, otherwise we AND them.
// A non-empty grouping is what marks the filter as exclusive here.
1446 bool exclusive = !grouping.empty();
// Store `range` as the filter for `grouping`, replacing any existing entry.
1455 void add_filter_range(const string& grouping, const Query & range) {
1456 filter[grouping] = range;
// Merge `range` into the filter for `grouping`; operator[] creates a
// default-constructed entry first if none exists.
1459 void append_filter_range(const string& grouping, const Query & range) {
1460 Query & q = filter[grouping];
// Fold all stored filters into one Query, starting from the first entry.
// At least one filter must be present (asserted).
1464 Query merge_filters() const {
1465 auto i = filter.begin();
1466 Assert(i != filter.end());
1467 Query q = i->second;
1468 while (++i != filter.end()) {
1475 /// A group of terms separated only by whitespace.
// The Term objects in the group, in the order they were added.
1477 vector<Term *> terms;
1479 /** Controls how to handle a group where all terms are stopwords.
1481 * If true, then as_group() returns NULL. If false, then the
1482 * stopword status of the terms is ignored.
// Start a group from its first two terms; empty_ok starts out false.
1486 TermGroup(Term* t1, Term* t2) : empty_ok(false) {
1492 /// Factory function - ensures heap allocation.
1493 static TermGroup* create(Term* t1, Term* t2) {
1494 return new TermGroup(t1, t2);
// NOTE(review): presumably the destructor's loop over the held terms - the
// enclosing signature and the loop body are not visible here.
1498 for (auto&& t : terms) {
1503 /// Add a Term object to this TermGroup object.
1504 void add_term(Term * term) {
1505 terms.push_back(term);
1508 /// Set the empty_ok flag.
1509 void set_empty_ok() { empty_ok = true; }
1511 /// Convert to a Xapian::Query * using default_op.
1512 Query * as_group(State *state) const;
1516 TermGroup::as_group(State *state) const
1518 const Xapian::Stopper * stopper = state->get_stopper();
1519 size_t stoplist_size = state->stoplist_size();
1520 bool default_op_is_positional = is_positional(state->default_op());
1522 Query::op default_op = state->default_op();
1523 vector<Query> subqs;
1524 subqs.reserve(terms.size());
1525 if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
1526 // Check for multi-word synonyms.
1527 Database db = state->get_database();
1530 vector<Term*>::size_type begin = 0;
1531 vector<Term*>::size_type i = begin;
1532 while (terms.size() - i > 0) {
1533 size_t longest_match = 0;
1534 // This value is never used, but GCC 4.8 warns with
1535 // -Wmaybe-uninitialized (GCC 5.4 doesn't).
1536 vector<Term*>::size_type longest_match_end = 0;
1537 if (terms.size() - i >= 2) {
1538 // Greedily try to match as many consecutive words as possible.
1539 key = terms[i]->name;
1541 key += terms[i + 1]->name;
1542 TermIterator synkey(db.synonym_keys_begin(key));
1543 TermIterator synend(db.synonym_keys_end(key));
1544 if (synkey != synend) {
1545 longest_match = key.size();
1546 longest_match_end = i + 2;
1547 for (auto j = i + 2; j < terms.size(); ++j) {
1549 key += terms[j]->name;
1550 synkey.skip_to(key);
1551 if (synkey == synend)
1553 const string& found = *synkey;
1554 if (!startswith(found, key))
1556 if (found.size() == key.size()) {
1557 longest_match = key.size();
1558 longest_match_end = j + 1;
1563 if (longest_match == 0) {
1564 // No multi-synonym matches at position i.
1565 if (stopper && (*stopper)(terms[i]->name)) {
1566 state->add_to_stoplist(terms[i]);
1568 if (default_op_is_positional)
1569 terms[i]->need_positions();
1570 subqs.push_back(terms[i]->get_query_with_auto_synonyms());
1575 i = longest_match_end;
1576 key.resize(longest_match);
1578 vector<Query> subqs2;
1579 for (auto j = begin; j != i; ++j) {
1580 if (stopper && (*stopper)(terms[j]->name)) {
1581 state->add_to_stoplist(terms[j]);
1583 if (default_op_is_positional)
1584 terms[i]->need_positions();
1585 subqs2.push_back(terms[j]->get_query());
1588 Query q_original_terms;
1589 if (default_op_is_positional) {
1590 q_original_terms = Query(default_op,
1591 subqs2.begin(), subqs2.end(),
1594 q_original_terms = Query(default_op,
1595 subqs2.begin(), subqs2.end());
1599 // Use the position of the first term for the synonyms.
1600 TermIterator syn = db.synonyms_begin(key);
1601 Query q(Query::OP_SYNONYM,
1602 SynonymIterator(syn, terms[begin]->pos, &q_original_terms),
1603 SynonymIterator(db.synonyms_end(key)));
1609 vector<Term*>::const_iterator i;
1610 for (i = terms.begin(); i != terms.end(); ++i) {
1611 if (stopper && (*stopper)((*i)->name)) {
1612 state->add_to_stoplist(*i);
1614 if (default_op_is_positional)
1615 (*i)->need_positions();
1616 subqs.push_back((*i)->get_query_with_auto_synonyms());
1621 if (!empty_ok && stopper && subqs.empty() &&
1622 stoplist_size < state->stoplist_size()) {
1623 // This group is all stopwords, so roll-back, disable stopper
1624 // temporarily, and reprocess this group.
1625 state->stoplist_resize(stoplist_size);
1631 if (!subqs.empty()) {
1632 if (default_op_is_positional) {
1633 q = new Query(default_op, subqs.begin(), subqs.end(),
1636 q = new Query(default_op, subqs.begin(), subqs.end());
1643 /// Some terms which form a positional sub-query.
1645 vector<Term *> terms;
1649 * size_t(-1) means don't use positional info (so an OP_AND query gets
1654 /** Keep track of whether the terms added all have the same list of
1655 * prefixes. If so, we'll build a set of phrases, one using each prefix.
1656 * This works around the limitation that a phrase cannot have multiple
1657 * components which are "OR" combinations of terms, but is also probably
1658 * what users expect: i.e., if a user specifies a phrase in a field, and
1659 * that field maps to multiple prefixes, the user probably wants a phrase
1660 * returned with all terms having one of those prefixes, rather than a
1661 * phrase comprised of terms with differing prefixes.
1663 bool uniform_prefixes;
1665 /** The list of prefixes of the terms added.
1666 * This will be NULL if the terms have different prefixes.
1668 const vector<string>* prefixes;
// Build the sub-query for one list of term queries: OP_AND is constructed
// without a window, any other operator is given window size w.
1670 Query opwindow_subq(Query::op op,
1671 const vector<Query>& v,
1672 Xapian::termcount w) const {
1673 if (op == Query::OP_AND) {
1674 return Query(op, v.begin(), v.end());
1676 return Query(op, v.begin(), v.end(), w);
1679 /// Convert to a query using the given operator and window size.
1680 Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
// A window of size_t(-1) means positional info isn't used, so the operator is
// replaced by OP_AND.
1681 if (window == size_t(-1)) op = Query::OP_AND;
1683 size_t n_terms = terms.size();
// The window actually used is w_delta plus the number of terms.
1684 Xapian::termcount w = w_delta + terms.size();
1685 if (uniform_prefixes) {
// One positional sub-query per prefix, combined with OP_OR.
1687 for (auto&& prefix : *prefixes) {
1688 vector<Query> subqs;
1689 subqs.reserve(n_terms);
1690 for (Term* t : terms) {
1691 subqs.push_back(Query(t->make_term(prefix), 1, t->pos));
1693 add_to_query(q, Query::OP_OR, opwindow_subq(op, subqs, w));
// Prefix lists differ: build a single sub-query from each term's own query.
1697 vector<Query> subqs;
1698 subqs.reserve(n_terms);
1699 for (Term* t : terms) {
1700 subqs.push_back(t->get_query());
1702 q = new Query(opwindow_subq(op, subqs, w));
// no_pos selects the size_t(-1) window (no positional info); otherwise the
// window starts at 0.  uniform_prefixes starts out true.
1709 explicit Terms(bool no_pos)
1710 : window(no_pos ? size_t(-1) : 0),
1711 uniform_prefixes(true),
1715 /// Factory function - ensures heap allocation.
1716 static Terms* create(State* state) {
1717 return new Terms(state->flags & QueryParser::FLAG_NO_POSITIONS);
// NOTE(review): presumably the destructor's loop over the held terms - the
// enclosing signature and the loop body are not visible here.
1721 for (auto&& t : terms) {
1726 /// Add an unstemmed Term object to this Terms object.
1727 void add_positional_term(Term * term) {
1728 const auto& term_prefixes = term->field_info->prefixes;
// The first term fixes the prefix list.  For later terms the pointers are
// compared first, and the lists themselves only if the pointers differ; a
// differing list clears uniform_prefixes.
1729 if (terms.empty()) {
1730 prefixes = &term_prefixes;
1731 } else if (uniform_prefixes && prefixes != &term_prefixes) {
1732 if (*prefixes != term_prefixes) {
1734 uniform_prefixes = false;
1737 term->need_positions();
1738 terms.push_back(term);
// Raise the window to alternative_window if that is larger than the current
// value (so a size_t(-1) window is never changed).
1741 void adjust_window(size_t alternative_window) {
1742 if (alternative_window > window) window = alternative_window;
1745 /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
1746 Query * as_phrase_query() const {
1747 return as_opwindow_query(Query::OP_PHRASE, 0);
1750 /// Convert to a Xapian::Query * using OP_NEAR.
1751 Query * as_near_query() const {
1752 // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
1753 // means a window size of 11. For more than 2 terms, we just add one
1754 // to the window size for each extra term.
1757 return as_opwindow_query(Query::OP_NEAR, w - 1);
1760 /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
1761 Query * as_adj_query() const {
1762 // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
1763 // which means a window size of 11. For more than 2 terms, we just add
1764 // one to the window size for each extra term.
1767 return as_opwindow_query(Query::OP_PHRASE, w - 1);
// Split this term's name into its individual Unicode characters and add each
// one to `terms` as a separate positional Term.  Every new Term reuses this
// term's state, field_info, unstemmed form, stem strategy and position.
1772 Term::as_positional_cjk_term(Terms * terms) const
1774 // Add each individual CJK character to the phrase.
1776 for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
1777 Unicode::append_utf8(t, *it);
1778 Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
1779 terms->add_positional_term(c);
1783 // FIXME: we want to add the n-grams as filters too for efficiency.
1788 // Helper macro to check for missing arguments to a boolean operator.
// On failure it stores "Syntax: <expression> OP_TXT <expression>" in
// state->error and calls yy_parse_failed(); OP_TXT must be a string literal
// because it is pasted between two other literals.
1789 #define VET_BOOL_ARGS(A, B, OP_TXT) \
1792 state->error = "Syntax: <expression> " OP_TXT " <expression>";\
1793 yy_parse_failed(yypParser);\
1800 %token_type {Term *}
1801 %token_destructor { delete $$; }
1803 %extra_argument {State * state}
1806 // If we've not already set an error message, set a default one.
1807 if (!state->error) state->error = "parse error";
1811 yy_parse_failed(yypParser);
1814 // Operators, grouped in order of increasing precedence:
1820 %left LOVE HATE HATE_AFTER_AND SYNONYM.
1822 // Destructors for terminal symbols:
1824 // TERM is a query term, including prefix (if any).
1825 %destructor TERM { delete $$; }
1827 // GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
1828 // is only separated by whitespace characters.
1829 %destructor GROUP_TERM { delete $$; }
1831 // PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
1832 // separated only by one or more phrase generator characters (hyphen and
1833 // apostrophe are common examples - see is_phrase_generator() for the list
1834 // of all punctuation which does this).
1835 %destructor PHR_TERM { delete $$; }
1837 // WILD_TERM is like a TERM, but has a trailing wildcard which needs to be expanded.
1839 %destructor WILD_TERM { delete $$; }
1841 // PARTIAL_TERM is like a TERM, but it's at the end of the query string and
1842 // we're doing "search as you type". It expands to something like WILD_TERM
1844 %destructor PARTIAL_TERM { delete $$; }
1846 // BOOLEAN_FILTER is a query term with a prefix registered using
1847 // add_boolean_prefix(). It's added to the query using an OP_FILTER operator,
1848 // (or OP_AND_NOT if it's negated) e.g. site:xapian.org or -site:xapian.org
1849 %destructor BOOLEAN_FILTER { delete $$; }
1853 // query - The whole query - just an expr or nothing.
1855 // query non-terminal doesn't need a type, so just give a dummy one.
1858 query ::= expr(E). {
1859 // Save the parsed query in the State structure so we can return it.
1864 state->query = Query();
1869 // Handle a query string with no terms in.
1870 state->query = Query();
1873 // expr - A query expression.
// Each binary rule below vets its operands with VET_BOOL_ARGS, which reports a
// "Syntax: ..." error naming the operator when an argument is missing.
1875 %type expr {Query *}
1876 %destructor expr { delete $$; }
1878 expr(E) ::= prob_expr(E).
1880 expr(A) ::= bool_arg(A) AND bool_arg(B). {
1881 VET_BOOL_ARGS(A, B, "AND");
1886 expr(A) ::= bool_arg(A) NOT bool_arg(B). {
1887 // 'NOT foo' -> '<alldocuments> NOT foo'
// Only done when FLAG_PURE_NOT is set; otherwise the missing left argument is
// reported by VET_BOOL_ARGS below.
1888 if (!A && (state->flags & QueryParser::FLAG_PURE_NOT)) {
1889 A = new Query("", 1, 0);
1891 VET_BOOL_ARGS(A, B, "NOT");
// [NOT] gives this rule the precedence of the NOT token.
1896 expr(A) ::= bool_arg(A) AND NOT bool_arg(B). [NOT] {
1897 VET_BOOL_ARGS(A, B, "AND NOT");
// [AND] gives this rule the precedence of the AND token; the error text names
// the operator simply as "AND".
1902 expr(A) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND] {
1903 VET_BOOL_ARGS(A, B, "AND");
1908 expr(A) ::= bool_arg(A) OR bool_arg(B). {
1909 VET_BOOL_ARGS(A, B, "OR");
1914 expr(A) ::= bool_arg(A) XOR bool_arg(B). {
1915 VET_BOOL_ARGS(A, B, "XOR");
1920 // bool_arg - an argument to a boolean operator such as AND or OR.
1922 %type bool_arg {Query *}
1923 %destructor bool_arg { delete $$; }
1925 bool_arg(A) ::= expr(A).
// The empty alternative: matches when an operator has no argument on one side.
1927 bool_arg(A) ::= . [ERROR] {
1928 // Set the argument to NULL, which enables the bool_arg-using rules in
1929 // expr above to report uses of AND, OR, etc which don't have two arguments.
1934 // prob_expr - a single compound term, or a prob.
1936 %type prob_expr {Query *}
1937 %destructor prob_expr { delete $$; }
// Assemble the final Query from the parts collected in the ProbQuery: loved
// terms first, then boolean filters, then hated terms.
1939 prob_expr(E) ::= prob(P). {
1942 // Handle any "+ terms".
1944 if (P->love->empty()) {
1950 add_to_query(E, Query::OP_AND_MAYBE, P->love);
1956 // Handle any boolean filters.
1957 if (!P->filter.empty()) {
1959 add_to_query(E, Query::OP_FILTER, P->merge_filters());
1961 // Make the query a boolean one.
// The merged filters are scaled by a factor of 0.0.
1962 E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
1965 // Handle any "- terms".
1966 if (P->hate && !P->hate->empty()) {
1969 yy_parse_failed(yypParser);
// Hated terms are excluded from the result with OP_AND_NOT.
1972 *E = Query(Query::OP_AND_NOT, *E, *P->hate);
1977 prob_expr(E) ::= term(E).
1979 // prob - a sub-expression consisting of stop_terms, "+" terms, "-" terms,
1980 // boolean filters, and/or ranges.
1982 // Note: stop_term can also be several other things other than a simple term!
1984 %type prob {ProbQuery *}
1985 %destructor prob { delete $$; }
// A range on its own starts a new ProbQuery holding just that range filter.
// The grouping and range query are taken from R before P is assigned, since
// P overwrites R.
1987 prob(P) ::= RANGE(R). {
1988 string grouping = R->name;
1989 const Query & range = R->as_range_query();
1990 P = new ProbQuery; /*P-overwrites-R*/
1991 P->add_filter_range(grouping, range);
// A further range is merged into the existing filter for its grouping.
1994 prob(P) ::= stop_prob(P) RANGE(R). {
1995 string grouping = R->name;
1996 const Query & range = R->as_range_query();
1997 P->append_filter_range(grouping, range);
// Two adjacent stop_terms: start a ProbQuery from T, then combine U with it
// using the default operator.
2000 prob(P) ::= stop_term(T) stop_term(U). {
2001 P = new ProbQuery(T); /*P-overwrites-T*/
2003 Query::op op = state->default_op();
2004 if (P->query && is_positional(op)) {
2005 // If default_op is OP_NEAR or OP_PHRASE, set the window size to
2006 // 11 for the first pair of terms and it will automatically grow
2007 // by one for each subsequent term.
2008 Query * subqs[2] = { P->query, U };
2009 *(P->query) = Query(op, subqs, subqs + 2, 11);
2012 add_to_query(P->query, op, U);
2017 prob(P) ::= prob(P) stop_term(T). {
2018 // If T is a stopword, there's nothing to do here.
2019 if (T) add_to_query(P->query, state->default_op(), T);
// A loved term starting a new ProbQuery; handling depends on whether the
// default operator is OP_AND.
2022 prob(P) ::= LOVE term(T). {
2024 if (state->default_op() == Query::OP_AND) {
2031 prob(P) ::= stop_prob(P) LOVE term(T). {
2032 if (state->default_op() == Query::OP_AND) {
2033 /* The default op is AND, so we just put loved terms into the query
2034 * (in this case the only effect of love is to ignore the stopword
2036 add_to_query(P->query, Query::OP_AND, T);
2038 add_to_query(P->love, Query::OP_AND, T);
2042 prob(P) ::= HATE term(T). {
// Hated terms accumulate in P->hate, combined with OP_OR.
2047 prob(P) ::= stop_prob(P) HATE term(T). {
2048 add_to_query(P->hate, Query::OP_OR, T);
// A hated boolean filter is treated like a hated term: its query goes into
// P->hate.
2051 prob(P) ::= HATE BOOLEAN_FILTER(T). {
2053 P->hate = new Query(T->get_query());
2057 prob(P) ::= stop_prob(P) HATE BOOLEAN_FILTER(T). {
2058 add_to_query(P->hate, Query::OP_OR, T->get_query());
// A boolean filter on its own: store it under its grouping.
2062 prob(P) ::= BOOLEAN_FILTER(T). {
2064 P->add_filter(T->get_grouping(), T->get_query());
// A further boolean filter is merged via append_filter().
2068 prob(P) ::= stop_prob(P) BOOLEAN_FILTER(T). {
2069 P->append_filter(T->get_grouping(), T->get_query());
2073 prob(P) ::= LOVE BOOLEAN_FILTER(T). {
2074 // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
2076 P->filter[T->get_grouping()] = T->get_query();
prob(P) ::= stop_prob(P) LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER, so merge it
    // exactly as the plain BOOLEAN_FILTER rule does.  append_filter() ORs
    // filters which share an exclusive grouping and ANDs non-exclusive ones;
    // unconditionally ORing here would wrongly OR together unrelated
    // non-exclusive filters, which all share the empty grouping.
    P->append_filter(T->get_grouping(), T->get_query());
    // T is referenced by name in this action, so it has to be freed here.
    delete T;
}
2088 // stop_prob - A prob or a stop_term.
2090 %type stop_prob {ProbQuery *}
2091 %destructor stop_prob { delete $$; }
2093 stop_prob(P) ::= prob(P).
2095 stop_prob(P) ::= stop_term(T). {
2096 P = new ProbQuery(T); /*P-overwrites-T*/
2099 // stop_term - A term which should be checked against the stopword list,
2100 // or a compound_term.
2102 // If a term is loved, hated, or in a phrase, we don't want to consult the
2103 // stopword list, so stop_term isn't used there (instead term is).
2105 %type stop_term {Query *}
2106 %destructor stop_term { delete $$; }
2108 stop_term(T) ::= TERM(U). {
2109 if (state->is_stopword(U)) {
2111 state->add_to_stoplist(U);
2113 T = new Query(U->get_query_with_auto_synonyms());
2118 stop_term(T) ::= compound_term(T).
2120 // term - A term or a compound_term.
2122 %type term {Query *}
2123 %destructor term { delete $$; }
2125 term(T) ::= TERM(U). {
2126 T = new Query(U->get_query_with_auto_synonyms());
2130 term(T) ::= compound_term(T).
2132 // compound_term - A WILD_TERM, a quoted phrase (with or without prefix), a
2133 // phrased_term, group, near_expr, adj_expr, or a bracketed subexpression (with
2134 // or without prefix).
// A /*T-overwrites-X*/ marker in an action tells lemon that assigning the
// left-hand side value deliberately overwrites right-hand side symbol X.
2136 %type compound_term {Query *}
2137 %destructor compound_term { delete $$; }
2139 compound_term(T) ::= WILD_TERM(U).
2140 { T = U->as_wildcarded_query(state); /*T-overwrites-U*/ }
2142 compound_term(T) ::= PARTIAL_TERM(U).
2143 { T = U->as_partial_query(state); /*T-overwrites-U*/ }
// Quoted phrase: the phrase between the two QUOTE tokens becomes a phrase
// query.
2145 compound_term(T) ::= QUOTE phrase(P) QUOTE.
2146 { T = P->as_phrase_query(); }
2148 compound_term(T) ::= phrased_term(P).
2149 { T = P->as_phrase_query(); /*T-overwrites-P*/ }
2151 compound_term(T) ::= group(P).
2152 { T = P->as_group(state); /*T-overwrites-P*/ }
2154 compound_term(T) ::= near_expr(P).
2155 { T = P->as_near_query(); /*T-overwrites-P*/ }
2157 compound_term(T) ::= adj_expr(P).
2158 { T = P->as_adj_query(); /*T-overwrites-P*/ }
// Bracketed sub-expression.
2160 compound_term(T) ::= BRA expr(E) KET.
// Explicit synonym operator applied to a single term.
2163 compound_term(T) ::= SYNONYM TERM(U). {
2164 T = new Query(U->get_query_with_synonyms());
// Note the action body below is wrapped in a second, nested pair of braces.
2168 compound_term(T) ::= CJKTERM(U). {
2169 { T = U->as_cjk_query(); /*T-overwrites-U*/ }
2172 // phrase - The "inside the quotes" part of a double-quoted phrase.
2174 %type phrase {Terms *}
2176 %destructor phrase { delete $$; }
2178 phrase(P) ::= TERM(T). {
2179 P = Terms::create(state);
2180 P->add_positional_term(T);
2183 phrase(P) ::= CJKTERM(T). {
2184 P = Terms::create(state);
2185 T->as_positional_cjk_term(P);
2188 phrase(P) ::= phrase(P) TERM(T). {
2189 P->add_positional_term(T);
2192 phrase(P) ::= phrase(P) CJKTERM(T). {
2193 T->as_positional_cjk_term(P);
2196 // phrased_term - A phrased term works like a single term, but is actually
2197 // 2 or more terms linked together into a phrase by punctuation. There must be
2198 // at least 2 terms in order to be able to have punctuation between the terms!
2200 %type phrased_term {Terms *}
2201 %destructor phrased_term { delete $$; }
2203 phrased_term(P) ::= TERM(T) PHR_TERM(U). {
2204 P = Terms::create(state);
2205 P->add_positional_term(T);
2206 P->add_positional_term(U);
2209 phrased_term(P) ::= phrased_term(P) PHR_TERM(T). {
2210 P->add_positional_term(T);
2213 // group - A group of terms separated only by whitespace - candidates for
2214 // multi-term synonyms.
2216 %type group {TermGroup *}
2217 %destructor group { delete $$; }
2219 group(P) ::= TERM(T) GROUP_TERM(U). {
2220 P = TermGroup::create(T, U); /*P-overwrites-T*/
2223 group(P) ::= group(P) GROUP_TERM(T). {
2227 group(P) ::= group(P) EMPTY_GROUP_OK. {
2231 // near_expr - 2 or more terms with NEAR in between. There must be at least 2
2232 // terms in order for there to be any NEAR operators!
// N is the value pushed with the NEAR token: a Term built from an explicit
// width, or NULL when no width was given.  Its termpos is offered to
// adjust_window(), which only ever widens the window.
2234 %type near_expr {Terms *}
2235 %destructor near_expr { delete $$; }
2237 near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
2238 P = Terms::create(state);
2239 P->add_positional_term(T);
2240 P->add_positional_term(U);
2242 P->adjust_window(N->get_termpos());
// Each further NEAR appends one more term to the same Terms object.
2247 near_expr(P) ::= near_expr(P) NEAR(N) TERM(T). {
2248 P->add_positional_term(T);
2250 P->adjust_window(N->get_termpos());
2255 // adj_expr - 2 or more terms with ADJ in between. There must be at least 2
2256 // terms in order for there to be any ADJ operators!
// N is the value pushed with the ADJ token: a Term built from an explicit
// width, or NULL when no width was given.  Its termpos is offered to
// adjust_window(), which only ever widens the window.
2258 %type adj_expr {Terms *}
2259 %destructor adj_expr { delete $$; }
2261 adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
2262 P = Terms::create(state);
2263 P->add_positional_term(T);
2264 P->add_positional_term(U);
2266 P->adjust_window(N->get_termpos());
// Each further ADJ appends one more term to the same Terms object.
2271 adj_expr(P) ::= adj_expr(P) ADJ(N) TERM(T). {
2272 P->add_positional_term(T);
2274 P->adjust_window(N->get_termpos());
2279 // Select lemon syntax highlighting in vim editor: vim: syntax=lemon