3 * @brief build a Xapian::Query object from a user query string
5 /* Copyright (C) 2004-2024 Olly Betts
6 * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
7 * Copyright (C) 2010 Adam Sjøgren
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
27 #include "queryparser_internal.h"
29 #include "api/queryinternal.h"
32 #include "stringutils.h"
33 #include "xapian/error.h"
34 #include "xapian/unicode.h"
36 // Include the list of token values lemon generates.
37 #include "queryparser_token.h"
39 #include "word-breaker.h"
46 #include <string_view>
49 // We create the yyParser on the stack.
50 #define Parse_ENGINEALWAYSONSTACK
54 using namespace Xapian;
56 static constexpr unsigned NO_EDIT_DISTANCE = unsigned(-1);
57 static constexpr unsigned DEFAULT_EDIT_DISTANCE = 2;
60 U_isupper(unsigned ch) {
61 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
65 U_isdigit(unsigned ch) {
66 return ch < 128 && C_isdigit(static_cast<unsigned char>(ch));
70 U_isalpha(unsigned ch) {
71 return ch < 128 && C_isalpha(static_cast<unsigned char>(ch));
74 using Xapian::Unicode::is_whitespace;
77 is_not_whitespace(unsigned ch) {
78 return !is_whitespace(ch);
81 using Xapian::Unicode::is_wordchar;
84 is_not_wordchar(unsigned ch) {
85 return !is_wordchar(ch);
89 is_digit(unsigned ch) {
90 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
93 // FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
94 // and there's the risk of hyphens getting stuck onto the end of terms...
96 // There are currently assumptions below that this only matches ASCII
99 is_suffix(unsigned ch) {
100 return ch == '+' || ch == '#';
104 is_double_quote(unsigned ch) {
105 // We simply treat all double quotes as equivalent, which is a bit crude,
106 // but it isn't clear that it would actually be better to require them to
109 // 0x201c is Unicode opening double quote.
110 // 0x201d is Unicode closing double quote.
111 return ch == '"' || ch == 0x201c || ch == 0x201d;
115 prefix_needs_colon(const string & prefix, unsigned ch)
117 if (!U_isupper(ch) && ch != ':') return false;
118 string::size_type len = prefix.length();
119 return (len > 1 && prefix[len - 1] != ':');
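// Illustrative example (an assumption-laden sketch, not part of the original
// file): with a multi-character prefix such as "XFOO", a ':' separator is
// only inserted when the term itself starts with an upper-case letter or ':',
// keeping the prefix/term boundary unambiguous:
//
//   prefix "XFOO" + term "bar"  ->  "XFOObar"
//   prefix "XFOO" + term "Bar"  ->  "XFOO:Bar"
//
// Single-character prefixes (len <= 1) never get a ':' added.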
122 using Unicode::is_currency;
125 is_positional(Xapian::Query::op op)
127 return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
132 /** Class used to pass information about a token from lexer to parser.
134 * Generally an instance of this class carries term information, but it can be
135 * used for a range query, and with some operators (e.g. the distance in
136 * NEAR/3 or ADJ/3, etc).
143 const FieldInfo * field_info;
145 QueryParser::stem_strategy stem;
148 unsigned edit_distance;
150 Term(const string &name_, termpos pos_)
151 : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
152 explicit Term(const string &name_)
153 : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
154 Term(const string &name_, const FieldInfo * field_info_)
155 : name(name_), field_info(field_info_),
156 stem(QueryParser::STEM_NONE), pos(0) { }
157 explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
158 Term(State * state_, const string &name_, const FieldInfo * field_info_,
159 const string &unstemmed_,
160 QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
162 unsigned edit_distance_ = NO_EDIT_DISTANCE)
163 : state(state_), name(name_), field_info(field_info_),
164 unstemmed(unstemmed_), stem(stem_), pos(pos_),
165 edit_distance(edit_distance_) { }
167 Term(const Xapian::Query & q, const string & grouping)
168 : name(grouping), query(q) { }
170 string make_term(const string & prefix) const;
172 void need_positions() {
173 if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
176 termpos get_termpos() const { return pos; }
178 string get_grouping() const {
179 return field_info->grouping;
182 Query * as_fuzzy_query(State * state) const;
184 Query * as_wildcarded_query(State * state) const;
186 /** Build a query for a term at the very end of the query string when
187 * FLAG_PARTIAL is in use.
189 * This query should match documents containing any terms which start with
190 * the characters specified, but should give a higher score to exact
191 * matches (since the user might have finished typing - we simply don't know).
194 Query * as_partial_query(State * state_) const;
196 /** Build a query for a string of words without explicit word breaks. */
197 Query* as_unbroken_query() const;
199 /** Handle text without explicit word breaks in a positional context. */
200 void as_positional_unbroken(Terms* terms) const;
203 Query as_range_query() const;
205 Query get_query() const;
207 Query get_query_with_synonyms() const;
209 Query get_query_with_auto_synonyms() const;
212 /// Parser State shared between the lexer and the parser.
214 QueryParser::Internal * qpi;
218 const char* error = NULL;
220 Query::op effective_default_op;
222 State(QueryParser::Internal * qpi_, unsigned flags_)
223 : qpi(qpi_), flags(flags_), effective_default_op(qpi_->default_op)
225 if ((flags & QueryParser::FLAG_NO_POSITIONS)) {
226 if (is_positional(effective_default_op)) {
227 effective_default_op = Query::OP_AND;
232 string stem_term(const string &term) {
233 return qpi->stemmer(term);
236 void add_to_stoplist(const Term * term) {
237 qpi->stoplist.push_back(term->name);
240 void add_to_unstem(const string & term, const string & unstemmed) {
241 qpi->unstem.insert(make_pair(term, unstemmed));
244 Term * range(const string &a, const string &b) {
245 for (auto i : qpi->rangeprocs) {
246 Xapian::Query range_query = (i.proc)->check_range(a, b);
247 Xapian::Query::op op = range_query.get_type();
249 case Xapian::Query::OP_INVALID:
251 case Xapian::Query::OP_VALUE_RANGE:
252 case Xapian::Query::OP_VALUE_GE:
253 case Xapian::Query::OP_VALUE_LE:
254 if (i.default_grouping) {
255 Xapian::Internal::QueryValueBase * base =
256 static_cast<Xapian::Internal::QueryValueBase*>(
257 range_query.internal.get());
258 Xapian::valueno slot = base->get_slot();
259 return new Term(range_query, str(slot));
262 case Xapian::Query::LEAF_TERM:
263 return new Term(range_query, i.grouping);
265 return new Term(range_query, string());
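// Illustrative sketch (not part of the original file; names here are
// assumptions): range() only matches if a RangeProcessor registered via the
// public QueryParser::add_rangeprocessor() API accepts the two endpoints:
//
//   Xapian::QueryParser qp;
//   Xapian::NumberRangeProcessor price_proc(0, "$"); // value slot 0, "$" prefix
//   qp.add_rangeprocessor(&price_proc);
//   // "$10..100" can now parse to an OP_VALUE_RANGE filter on slot 0:
//   Xapian::Query q = qp.parse_query("$10..100");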
271 Query::op default_op() const {
272 return effective_default_op;
275 bool is_stopword(const Term *term) const {
276 return qpi->stopper && (*qpi->stopper)(term->name);
279 Database get_database() const {
283 const Stopper * get_stopper() const {
284 return qpi->stopper.get();
287 size_t stoplist_size() const {
288 return qpi->stoplist.size();
291 void stoplist_resize(size_t s) {
292 qpi->stoplist.resize(s);
295 Xapian::termcount get_max_wildcard_expansion() const {
296 return qpi->max_wildcard_expansion;
299 int get_max_wildcard_type() const {
300 return qpi->max_wildcard_type;
303 unsigned get_min_wildcard_prefix_len() const {
304 return qpi->min_wildcard_prefix_len;
307 Xapian::termcount get_max_partial_expansion() const {
308 return qpi->max_partial_expansion;
311 int get_max_partial_type() const {
312 return qpi->max_partial_type;
315 unsigned get_min_partial_prefix_len() const {
316 return qpi->min_partial_prefix_len;
319 Xapian::termcount get_max_fuzzy_expansion() const {
320 return qpi->max_fuzzy_expansion;
323 int get_max_fuzzy_type() const {
324 return qpi->max_fuzzy_type;
329 Term::make_term(const string & prefix) const
332 if (stem != QueryParser::STEM_NONE && stem != QueryParser::STEM_ALL)
334 if (!prefix.empty()) {
336 if (prefix_needs_colon(prefix, name[0])) term += ':';
338 if (stem != QueryParser::STEM_NONE) {
339 term += state->stem_term(name);
344 if (!unstemmed.empty())
345 state->add_to_unstem(term, unstemmed);
349 // Iterator shim to allow building a synonym query from a TermIterator pair.
350 class SynonymIterator {
351 Xapian::TermIterator i;
355 const Xapian::Query * first;
358 SynonymIterator(const Xapian::TermIterator & i_,
359 Xapian::termpos pos_ = 0,
360 const Xapian::Query * first_ = NULL)
361 : i(i_), pos(pos_), first(first_) { }
363 SynonymIterator & operator++() {
371 const Xapian::Query operator*() const {
372 if (first) return *first;
373 return Xapian::Query(*i, 1, pos);
376 bool operator==(const SynonymIterator & o) const {
377 return i == o.i && first == o.first;
380 bool operator!=(const SynonymIterator & o) const {
381 return !(*this == o);
384 typedef std::input_iterator_tag iterator_category;
385 typedef Xapian::Query value_type;
386 typedef Xapian::termcount_diff difference_type;
387 typedef Xapian::Query * pointer;
388 typedef Xapian::Query & reference;
392 Term::get_query_with_synonyms() const
394 // Handle single-word synonyms with each prefix.
395 const auto& prefixes = field_info->prefixes;
396 if (prefixes.empty()) {
397 Assert(field_info->proc);
398 return (*field_info->proc)(name);
401 Query q = get_query();
403 for (auto&& prefix : prefixes) {
404 // First try the unstemmed term:
406 if (!prefix.empty()) {
408 if (prefix_needs_colon(prefix, name[0])) term += ':';
412 Xapian::Database db = state->get_database();
413 Xapian::TermIterator syn = db.synonyms_begin(term);
414 Xapian::TermIterator end = db.synonyms_end(term);
415 if (syn == end && stem != QueryParser::STEM_NONE) {
416 // If that has no synonyms, try the stemmed form:
418 if (!prefix.empty()) {
420 if (prefix_needs_colon(prefix, name[0])) term += ':';
422 term += state->stem_term(name);
423 syn = db.synonyms_begin(term);
424 end = db.synonyms_end(term);
426 q = Query(q.OP_SYNONYM,
427 SynonymIterator(syn, pos, &q),
428 SynonymIterator(end));
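// Illustrative sketch (not part of the original file): for the synonym lookup
// above to find anything, synonyms must already be stored in the database and
// a synonym flag passed to parse_query(), e.g.:
//
//   Xapian::WritableDatabase db("idx", Xapian::DB_CREATE_OR_OPEN);
//   db.add_synonym("car", "automobile");
//
//   Xapian::QueryParser qp;
//   qp.set_database(db);
//   Xapian::Query q = qp.parse_query("car",
//       Xapian::QueryParser::FLAG_DEFAULT |
//       Xapian::QueryParser::FLAG_AUTO_SYNONYMS);
//   // q is roughly (car SYNONYM automobile).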
434 Term::get_query_with_auto_synonyms() const
436 const unsigned MASK_ENABLE_AUTO_SYNONYMS =
437 QueryParser::FLAG_AUTO_SYNONYMS |
438 QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
439 if (state->flags & MASK_ENABLE_AUTO_SYNONYMS)
440 return get_query_with_synonyms();
446 add_to_query(Query *& q, Query::op op, Query * term)
450 if (op == Query::OP_OR) {
452 } else if (op == Query::OP_AND) {
455 *q = Query(op, *q, *term);
464 add_to_query(Query *& q, Query::op op, const Query & term)
467 if (op == Query::OP_OR) {
469 } else if (op == Query::OP_AND) {
472 *q = Query(op, *q, term);
480 Term::get_query() const
482 const auto& prefixes = field_info->prefixes;
483 if (prefixes.empty()) {
484 Assert(field_info->proc);
485 return (*field_info->proc)(name);
487 auto piter = prefixes.begin();
488 Query q(make_term(*piter), 1, pos);
489 while (++piter != prefixes.end()) {
490 q |= Query(make_term(*piter), 1, pos);
496 Term::as_fuzzy_query(State* state_) const
498 const auto& prefixes = field_info->prefixes;
499 Xapian::termcount max = state_->get_max_fuzzy_expansion();
500 int query_flags = state_->get_max_fuzzy_type();
502 subqs.reserve(prefixes.size());
503 for (auto&& prefix : prefixes) {
504 // Combine with OP_OR, and apply OP_SYNONYM afterwards.
505 subqs.emplace_back(Query::OP_EDIT_DISTANCE,
513 Query* q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
519 Term::as_wildcarded_query(State * state_) const
521 const auto& prefixes = field_info->prefixes;
522 Xapian::termcount max = state_->get_max_wildcard_expansion();
523 int query_flags = state_->get_max_wildcard_type();
524 if (state_->flags & QueryParser::FLAG_WILDCARD_SINGLE)
525 query_flags |= Query::WILDCARD_PATTERN_SINGLE;
526 if (state_->flags & QueryParser::FLAG_WILDCARD_MULTI)
527 query_flags |= Query::WILDCARD_PATTERN_MULTI;
529 subqs.reserve(prefixes.size());
530 for (string root : prefixes) {
532 // Combine with OP_OR, and apply OP_SYNONYM afterwards.
533 subqs.push_back(Query(Query::OP_WILDCARD, root, max, query_flags,
536 Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
542 Term::as_partial_query(State * state_) const
544 Xapian::termcount max = state_->get_max_partial_expansion();
545 int max_type = state_->get_max_partial_type();
546 vector<Query> subqs_partial; // A synonym of all the partial terms.
547 vector<Query> subqs_full; // A synonym of all the full terms.
549 for (const string& prefix : field_info->prefixes) {
550 string root = prefix;
552 // Combine with OP_OR, and apply OP_SYNONYM afterwards.
553 subqs_partial.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
555 // Add the term, as it would normally be handled, as an alternative.
556 subqs_full.push_back(Query(make_term(prefix), 1, pos));
558 Query * q = new Query(Query::OP_OR,
559 Query(Query::OP_SYNONYM,
560 subqs_partial.begin(), subqs_partial.end()),
561 Query(Query::OP_SYNONYM,
562 subqs_full.begin(), subqs_full.end()));
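// Illustrative sketch (not part of the original file): as_partial_query() is
// what backs "search as you type" via QueryParser::FLAG_PARTIAL, e.g.:
//
//   Xapian::QueryParser qp;
//   qp.set_database(db); // 'db' assumed to be an open Xapian::Database
//   Xapian::Query q = qp.parse_query("new yor",
//       Xapian::QueryParser::FLAG_DEFAULT |
//       Xapian::QueryParser::FLAG_PARTIAL);
//   // The trailing "yor" becomes roughly (WILDCARD "yor") OR (term "yor"),
//   // so documents containing "york" match while an exact "yor" scores too.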
568 Term::as_unbroken_query() const
570 const auto& prefixes = field_info->prefixes;
572 vector<Query> prefix_subqs;
575 if (state->flags & QueryParser::FLAG_WORD_BREAKS) {
576 for (WordIterator tk(name); tk != WordIterator(); ++tk) {
577 const string& token = *tk;
578 for (const string& prefix : prefixes) {
579 prefix_subqs.push_back(Query(prefix + token, 1, pos));
583 q = new Query(Query::OP_AND, prefix_subqs.begin(), prefix_subqs.end());
590 vector<Query> ngram_subqs;
592 for (const string& prefix : prefixes) {
593 for (NgramIterator tk(name); tk != NgramIterator(); ++tk) {
594 ngram_subqs.push_back(Query(prefix + *tk, 1, pos));
596 prefix_subqs.push_back(Query(Query::OP_AND,
597 ngram_subqs.begin(), ngram_subqs.end()));
600 q = new Query(Query::OP_OR, prefix_subqs.begin(), prefix_subqs.end());
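// Illustrative sketch (not part of the original file): text in a script
// without explicit word breaks (e.g. CJK) goes through one of the two
// branches above:
//
//   Xapian::QueryParser qp;
//   Xapian::Query q = qp.parse_query(cjk_text, // assumed std::string of CJK text
//       Xapian::QueryParser::FLAG_NGRAMS);
//   // With FLAG_NGRAMS this is, per prefix, an AND of the generated n-grams,
//   // OR-ed across prefixes; with FLAG_WORD_BREAKS (requires ICU) it would
//   // instead be an AND of the words ICU's word-break rules find.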
607 Term::as_range_query() const
615 is_phrase_generator(unsigned ch)
617 // These characters generate a phrase search.
618 // Ordered roughly by how often each character comes up in calls to this
619 // function when running the testcases in api_queryparser.cc.
620 return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
624 is_stem_preventer(unsigned ch)
626 return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
630 should_stem(const string & term)
632 const unsigned int SHOULD_STEM_MASK =
633 (1 << Unicode::LOWERCASE_LETTER) |
634 (1 << Unicode::TITLECASE_LETTER) |
635 (1 << Unicode::MODIFIER_LETTER) |
636 (1 << Unicode::OTHER_LETTER);
637 Utf8Iterator u(term);
638 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
641 /** Value representing "ignore this" when returned by check_infix() or
642 * check_infix_digit().
644 const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
646 inline unsigned check_infix(unsigned ch) {
647 if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
648 // Unicode includes all these except '&' in its word boundary rules,
649 // as well as 0x2019 (which we handle below) and ':' (for Swedish
650 // apparently, but we ignore this for now as it's problematic in
651 // real world cases).
655 // 0x2019 is Unicode apostrophe and single closing quote.
656 // 0x201b is Unicode single opening quote with the tail rising.
657 if (ch == 0x2019 || ch == 0x201b)
659 if (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff)
660 return UNICODE_IGNORE;
665 inline unsigned check_infix_digit(unsigned ch) {
666 // This list of characters comes from Unicode's word identifying algorithm.
671 case 0x037e: // GREEK QUESTION MARK
672 case 0x0589: // ARMENIAN FULL STOP
673 case 0x060D: // ARABIC DATE SEPARATOR
674 case 0x07F8: // NKO COMMA
675 case 0x2044: // FRACTION SLASH
676 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
677 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
678 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
681 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
682 return UNICODE_IGNORE;
686 // Prototype a function which lemon generates, but which we want to call
687 // before its definition appears in the generated source file.
689 static void yy_parse_failed(yyParser *);
692 QueryParser::Internal::add_prefix(string_view field, string_view prefix)
694 // Allow for optional trailing `:` for consistency with how range prefixes
696 if (!field.empty() && field.back() == ':') {
697 field = field.substr(0, field.size() - 1);
699 #ifdef __cpp_lib_associative_heterogeneous_insertion // C++26
700 auto [it, inserted] = field_map.try_emplace(field, NON_BOOLEAN);
702 auto [it, inserted] = field_map.try_emplace(string(field), NON_BOOLEAN);
704 auto&& p = it->second;
710 // Check that this is the same type of filter as the existing one(s).
711 if (p.type != NON_BOOLEAN) {
712 throw Xapian::InvalidOperationError("Can't use add_prefix() and "
713 "add_boolean_prefix() on the "
714 "same field name, or "
715 "add_boolean_prefix() with "
716 "different values of the "
717 "'exclusive' parameter");
720 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
721 "and string prefixes currently "
723 // Only add if it's not already there as duplicate entries just result
724 // in redundant query terms. This is a linear scan so makes calling
725 // add_prefix() n times for the same field value with different prefix
726 // values O(n²), but you wouldn't realistically want to map one field
727 // to more than a handful of prefixes.
728 auto& prefixes = p.prefixes;
729 if (find(prefixes.begin(), prefixes.end(), prefix) == prefixes.end()) {
735 QueryParser::Internal::add_prefix(string_view field, FieldProcessor* proc)
737 // Allow for optional trailing `:` for consistency with how range prefixes
739 if (!field.empty() && field.back() == ':') {
740 field = field.substr(0, field.size() - 1);
742 #ifdef __cpp_lib_associative_heterogeneous_insertion // C++26
743 auto [it, inserted] = field_map.try_emplace(field,
746 auto [it, inserted] = field_map.try_emplace(string(field),
752 auto&& p = it->second;
753 // Check that this is the same type of filter as the existing one(s).
754 if (p.type != NON_BOOLEAN) {
755 throw Xapian::InvalidOperationError("Can't use add_prefix() and "
756 "add_boolean_prefix() on the "
757 "same field name, or "
758 "add_boolean_prefix() with "
759 "different values of the "
760 "'exclusive' parameter");
762 if (!p.prefixes.empty())
763 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
764 "and string prefixes currently "
766 throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects "
767 "for the same prefix currently not "
772 QueryParser::Internal::add_boolean_prefix(string_view field,
774 const string* grouping)
776 // Allow for optional trailing `:` for consistency with how range prefixes
778 if (!field.empty() && field.back() == ':') {
779 field = field.substr(0, field.size() - 1);
781 // Don't allow the empty prefix to be set as boolean as it doesn't
782 // really make sense.
784 throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
785 // If grouping == nullptr then it defaults to field which is not empty().
786 bool inclusive = grouping && grouping->empty();
787 filter_type type = inclusive ? BOOLEAN : BOOLEAN_EXCLUSIVE;
788 #ifdef __cpp_lib_associative_heterogeneous_insertion // C++26
789 auto [it, inserted] = field_map.try_emplace(field, type,
790 grouping ? *grouping : field);
792 auto [it, inserted] = field_map.try_emplace(string(field), type,
793 grouping ? *grouping : field);
795 auto&& p = it->second;
801 // Check that this is the same type of filter as the existing one(s).
802 if (p.type != type) {
803 throw Xapian::InvalidOperationError("Can't use add_prefix() and "
804 "add_boolean_prefix() on the "
805 "same field name, or "
806 "add_boolean_prefix() with "
807 "different values of the "
808 "'exclusive' parameter"); // FIXME
811 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
812 "and string prefixes currently "
814 // Only add if it's not already there as duplicate entries just result
815 // in redundant query terms. This is a linear scan so makes calling
816 // add_prefix() n times for the same field value with different prefix
817 // values O(n²), but you wouldn't realistically want to map one field
818 // to more than a handful of prefixes.
819 auto& prefixes = p.prefixes;
820 if (find(prefixes.begin(), prefixes.end(), prefix) == prefixes.end()) {
821 prefixes.emplace_back(prefix); // FIXME grouping
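// Illustrative sketch (not part of the original file; the "S" and "H" term
// prefixes are conventional assumptions): these internal methods sit behind
// the public QueryParser::add_prefix() and add_boolean_prefix() API, e.g.:
//
//   Xapian::QueryParser qp;
//   qp.add_prefix("title", "S");          // title:foo -> free-text term Sfoo
//   qp.add_prefix("title", "XSUBTITLE");  // same field again -> OR of both
//   qp.add_boolean_prefix("site", "H");   // site:foo -> boolean filter term
//
//   Xapian::Query q = qp.parse_query("title:dune site:example.org");
//   // q is roughly (Sdune OR XSUBTITLEdune) FILTER Hexample.org.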
826 QueryParser::Internal::add_boolean_prefix(string_view field,
827 FieldProcessor *proc,
828 const string* grouping)
830 // Allow for optional trailing `:` for consistency with how range prefixes
832 if (!field.empty() && field.back() == ':') {
833 field = field.substr(0, field.size() - 1);
835 // Don't allow the empty prefix to be set as boolean as it doesn't
836 // really make sense.
838 throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
839 // If grouping == nullptr then it defaults to field which is not empty().
840 bool inclusive = grouping && grouping->empty();
841 filter_type type = inclusive ? BOOLEAN : BOOLEAN_EXCLUSIVE;
842 #ifdef __cpp_lib_associative_heterogeneous_insertion // C++26
843 auto [it, inserted] = field_map.try_emplace(field, type, proc,
844 grouping ? *grouping : field);
846 auto [it, inserted] = field_map.try_emplace(string(field), type, proc,
847 grouping ? *grouping : field);
852 auto&& p = it->second;
853 // Check that this is the same type of filter as the existing one(s).
854 if (p.type != type) {
855 throw Xapian::InvalidOperationError("Can't use add_prefix() and "
856 "add_boolean_prefix() on the "
857 "same field name, or "
858 "add_boolean_prefix() with "
859 "different values of the "
860 "'exclusive' parameter"); // FIXME
862 if (!p.prefixes.empty())
863 throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects "
864 "and string prefixes currently "
866 throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects "
867 "for the same prefix currently not "
872 is_extended_wildcard(unsigned ch, unsigned flags)
874 if (ch == '*') return (flags & QueryParser::FLAG_WILDCARD_MULTI);
875 if (ch == '?') return (flags & QueryParser::FLAG_WILDCARD_SINGLE);
880 QueryParser::Internal::parse_term(Utf8Iterator& it, const Utf8Iterator& end,
881 bool try_word_break, unsigned flags,
882 bool& needs_word_break, bool& was_acronym,
883 size_t& first_wildcard,
885 unsigned& edit_distance)
889 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
890 // Don't worry if there's a trailing '.' or not.
891 if (U_isupper(*it)) {
895 Unicode::append_utf8(t, *p++);
897 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
898 // One letter does not make an acronym! If we handled a single
899 // uppercase letter here, we wouldn't catch M&S below.
900 if (t.length() > 1) {
901 // Check there's not a (lower case) letter or digit
902 // immediately after it.
903 // FIXME: should I.B.M..P.T.O be a range search?
904 if (p == end || !is_wordchar(*p)) {
912 was_acronym = !term.empty();
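// Illustrative examples (assumed, based on the loop above): "P.T.O." and
// "U.N.C.L.E" each lex to a single acronym term ("PTO", "UNCLE"), while a
// lone "I." is left to the normal term handling below so that cases like
// "M&S" still work.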
914 if (try_word_break && term.empty() && is_unbroken_script(*it)) {
915 const char* start = it.raw();
916 char_count = get_unbroken(it);
917 term.assign(start, it.raw() - start);
918 needs_word_break = true;
922 unsigned prevch = *it;
923 if (first_wildcard == term.npos &&
924 is_extended_wildcard(prevch, flags)) {
928 Unicode::append_utf8(term, prevch);
930 while (++it != end) {
931 if (try_word_break && is_unbroken_script(*it)) break;
933 if (is_extended_wildcard(ch, flags)) {
934 if (first_wildcard == term.npos) {
935 first_wildcard = char_count;
937 } else if (!is_wordchar(ch)) {
938 // Treat a single embedded '&' or "'" or similar as a word
939 // character (e.g. AT&T, Fred's). Also, normalise
940 // apostrophes to ASCII apostrophe.
944 unsigned nextch = *p;
945 if (is_extended_wildcard(nextch, flags)) {
946 // A wildcard follows, which could expand to a digit or a non-digit.
947 unsigned ch_orig = ch;
948 ch = check_infix(ch);
949 if (!ch && is_digit(prevch))
950 ch = check_infix_digit(ch_orig);
954 if (!is_wordchar(nextch)) break;
956 if (is_digit(prevch) && is_digit(nextch)) {
957 ch = check_infix_digit(ch);
959 ch = check_infix(ch);
962 if (ch == UNICODE_IGNORE)
965 Unicode::append_utf8(term, ch);
969 if (it != end && is_suffix(*it)) {
970 string suff_term = term;
972 // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
974 // Assumes is_suffix() only matches ASCII.
975 if (suff_term.size() - term.size() == 3) {
980 } while (is_suffix(*++p));
981 if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
982 // If the suffixed term doesn't exist, check that the
983 // non-suffixed term does. This also takes care of
984 // the case when QueryParser::set_database() hasn't been called.
986 bool use_suff_term = false;
987 string lc = Unicode::tolower(suff_term);
988 if (db.term_exists(lc)) {
989 use_suff_term = true;
991 lc = Unicode::tolower(term);
992 if (!db.term_exists(lc)) use_suff_term = true;
995 // Assumes is_suffix() only matches ASCII.
996 char_count += (suff_term.size() - term.size());
1002 if (first_wildcard == term.npos &&
1003 (flags & QueryParser::FLAG_WILDCARD)) {
1004 // Check for right-truncation.
1005 if (it != end && *it == '*') {
1007 first_wildcard = char_count;
1011 (flags & QueryParser::FLAG_FUZZY) &&
1013 first_wildcard == string::npos &&
1015 Utf8Iterator p = it;
1018 if (p == end || is_whitespace(ch) || ch == ')') {
1020 edit_distance = DEFAULT_EDIT_DISTANCE;
1021 } else if (U_isdigit(ch)) {
1022 unsigned distance = ch - '0';
1023 while (++p != end && U_isdigit(*p)) {
1024 distance = distance * 10 + (*p - '0');
1026 if (p != end && *p == '.') {
1027 if (distance == 0) goto fractional;
1028 // Ignore the fractional part on e.g. foo~12.5
1029 while (++p != end && U_isdigit(*p)) { }
1031 if (p == end || is_whitespace(ch) || ch == ')') {
1033 edit_distance = distance;
1035 } else if (ch == '.') {
1037 double fraction = 0.0;
1039 while (++p != end && U_isdigit(*p)) {
1040 fraction += digit * (*p - '0');
1043 if (p == end || is_whitespace(ch) || ch == ')') {
1045 unsigned codepoints = 0;
1046 for (Utf8Iterator u8(term); u8 != Utf8Iterator(); ++u8) {
1049 edit_distance = unsigned(codepoints * fraction);
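// Illustrative examples (assumed, following the parsing above; all of these
// require FLAG_FUZZY to be enabled):
//
//   "color~"     -> edit distance DEFAULT_EDIT_DISTANCE (2)
//   "color~3"    -> edit distance 3
//   "color~12.5" -> edit distance 12 (fractional part ignored when >= 1)
//   "color~0.4"  -> edit distance unsigned(0.4 * 5 codepoints) = 2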
1058 // Switch to %code to insert at the end of the file so struct yyParser has been defined by this point.
1063 QueryParser::Internal::parse_query(string_view qs, unsigned flags,
1064 const string& default_prefix)
1067 // Overall it seems best to check for this up front - otherwise we create
1068 // the unhelpful situation where a failure to enable ICU in the build could
1069 // be missed because queries in scripts which don't need word splitting still work.
1071 if (flags & FLAG_WORD_BREAKS) {
1072 throw Xapian::FeatureUnavailableError("FLAG_WORD_BREAKS requires "
1073 "building Xapian to use ICU");
1076 bool try_word_break =
1077 (flags & (FLAG_NGRAMS|FLAG_WORD_BREAKS)) || is_ngram_enabled();
1079 // Set ranges if we may have to handle ranges in the query.
1080 bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);
1082 termpos term_pos = 1;
1083 Utf8Iterator it(qs), end;
1085 State state(this, flags);
1087 // To successfully apply more than one spelling correction to a query
1088 // string, we must keep track of the offset due to previous corrections.
1089 int correction_offset = 0;
1090 corrected_query.resize(0);
1092 // Stack of prefixes, used for phrases and subexpressions.
1093 list<const FieldInfo *> prefix_stack;
1095 // If default_prefix is specified, use it. Otherwise, use any list
1096 // that has been set for the empty prefix.
1097 const FieldInfo def_pfx = FieldInfo{NON_BOOLEAN}.append(default_prefix);
1099 const FieldInfo * default_field_info = &def_pfx;
1100 if (default_prefix.empty()) {
1101 auto f = field_map.find(string_view{});
1102 if (f != field_map.end()) default_field_info = &(f->second);
1105 // We always have the current prefix on the top of the stack.
1106 prefix_stack.push_back(default_field_info);
1111 unsigned newprev = ' ';
1114 DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
1115 IN_GROUP2, EXPLICIT_SYNONYM
1117 while (it != end && !state.error) {
1118 bool last_was_operator = false;
1119 bool last_was_operator_needing_term = false;
1120 if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
1123 if (it == end) break;
1125 last_was_operator_needing_term = false;
1126 last_was_operator = true;
1129 just_had_operator_needing_term:
1130 last_was_operator_needing_term = true;
1131 last_was_operator = true;
1133 if (mode == IN_PHRASED_TERM) mode = DEFAULT;
1134 if (is_whitespace(*it)) {
1137 it = find_if(it, end, is_not_whitespace);
1138 if (it == end) break;
1142 (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
1143 // Scan forward to see if this could be the "start of range"
1144 // token. Sadly this has O(n²) tendencies, though at least
1145 // "n" is the number of words in a query which is likely to
1146 // remain fairly small. FIXME: can we tokenise more elegantly?
1147 Utf8Iterator it_initial = it;
1148 Utf8Iterator p = it;
1151 if (ch == '.' && *p == '.') {
1154 Unicode::append_utf8(a, *it++);
1156 // Trim off the trailing ".".
1157 a.resize(a.size() - 1);
1159 // Either end of the range can be empty (for an open-ended
1160 // range) but both can't be empty.
1161 if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
1163 // Allow any character except whitespace and ')' in the
1165 while (p != end && *p > ' ' && *p != ')') {
1166 Unicode::append_utf8(b, *p++);
1168 Term * range = state.range(a, b);
1170 state.error = "Unknown range operation";
1171 if (a.find(':', 1) == string::npos) {
1174 // Might be a boolean filter with ".." in. Leave
1175 // state.error in case it isn't.
1179 Parse(&parser, RANGE, range, &state);
1185 // Allow any character except whitespace and '(' in the lower
1187 if (ch <= ' ' || ch == '(') break;
1192 if (!is_wordchar(*it)) {
1193 unsigned prev = newprev;
1194 Utf8Iterator p = it;
1195 unsigned ch = *it++;
1197 // Drop out of IN_GROUP mode.
1198 if (mode == IN_GROUP || mode == IN_GROUP2)
1202 case 0x201c: // Left curly double quote.
1203 case 0x201d: // Right curly double quote.
1205 if (mode == DEFAULT) {
1207 it = find_if(it, end, is_not_whitespace);
1209 // Ignore an unmatched " at the end of the query to
1210 // avoid generating an empty pair of QUOTEs which will
1211 // cause a parse error.
1214 if (is_double_quote(*it)) {
1215 // Ignore empty "" (but only if we're not already
1216 // IN_QUOTES as we don't merge two adjacent quoted
1222 if (flags & QueryParser::FLAG_PHRASE) {
1223 if (ch == '"' && it != end && *it == '"') {
1225 // Handle "" inside a quoted phrase as an escaped " for
1226 // consistency with quoted boolean terms.
1229 Parse(&parser, QUOTE, NULL, &state);
1230 if (mode == DEFAULT) {
1233 // Remove the prefix we pushed for this phrase.
1234 if (mode == IN_PREFIXED_QUOTES)
1235 prefix_stack.pop_back();
1241 case '+': case '-': // Loved or hated term/phrase/subexpression.
1242 // Ignore + or - at the end of the query string.
1243 if (it == end) goto done;
1244 if (prev > ' ' && prev != '(') {
1245 // Or if not after whitespace or an open bracket.
1248 if (is_whitespace(*it) || *it == '+' || *it == '-') {
1249 // Ignore + or - followed by a space, or further + or -.
1250 // Postfix + (such as in C++ and H+) is handled as part of
1251 // the term lexing code in parse_term().
1255 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
1259 } else if (last_was_operator) {
1260 token = HATE_AFTER_AND;
1264 Parse(&parser, token, NULL, &state);
1265 goto just_had_operator_needing_term;
1267 // Need to prevent the term after a LOVE or HATE starting a
1271 case '(': // Bracketed subexpression.
1273 it = find_if(it, end, is_not_whitespace);
1274 // Ignore ( at the end of the query string.
1275 if (it == end) goto done;
1276 if (prev > ' ' && strchr("()+-", prev) == NULL) {
1277 // Or if not after whitespace or a bracket or '+' or '-'.
1285 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1286 prefix_stack.push_back(prefix_stack.back());
1287 Parse(&parser, BRA, NULL, &state);
1291 case ')': // End of bracketed subexpression.
1292 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1293 // Remove the prefix we pushed for the corresponding BRA.
1294 // If brackets are unmatched, it's a syntax error, but
1295 // that's no excuse to SEGV!
1296 if (prefix_stack.size() > 1) prefix_stack.pop_back();
1297 Parse(&parser, KET, NULL, &state);
1301 case '~': // Synonym expansion.
1302 // Ignore at the end of the query string.
1303 if (it == end) goto done;
1304 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
1305 if (prev > ' ' && strchr("+-(", prev) == NULL) {
1306 // Or if not after whitespace, +, -, or an open bracket.
1309 if (!is_wordchar(*it) && !is_double_quote(*it)) {
1310 // Ignore if not followed by a word character.
1313 Parse(&parser, SYNONYM, NULL, &state);
1314 mode = EXPLICIT_SYNONYM;
1315 if (!is_double_quote(*it))
1316 goto just_had_operator_needing_term;
1318 // Support ~"foo bar" syntax to explicitly expand
1319 // a multi-word synonym.
1323 it = find_if(it, end, is_not_whitespace);
1325 // Ignore an unmatched " at the end of the query to
1326 // avoid generating an empty pair of QUOTEs which will
1327 // cause a parse error.
1330 if (is_double_quote(*it)) {
1331 // Ignore empty ~"".
1335 Parse(&parser, QUOTE, NULL, &state);
1340 if (flags & FLAG_WILDCARD_MULTI) {
1342 goto leading_wildcard;
1346 if (flags & FLAG_WILDCARD_SINGLE) {
1348 goto leading_wildcard;
1352 // Skip any other characters.
1356 Assert(is_wordchar(*it));
1359 size_t term_start_index = it.raw() - qs.data();
1361 newprev = 'A'; // Any letter will do...
1363 // A term, a prefix, or a boolean operator.
1364 const FieldInfo * field_info = NULL;
1365 if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
1366 !field_map.empty()) {
1367 // Check for a fieldname prefix (e.g. title:historical).
1368 Utf8Iterator p = find_if(it, end, is_not_wordchar);
1369 if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
1373 Unicode::append_utf8(field, *p++);
1374 auto f = field_map.find(field);
1375 if (f != field_map.end()) {
1376 // Special handling for prefixed fields, depending on the
1377 // type of the prefix.
1379 field_info = &(f->second);
1381 if (field_info->type != NON_BOOLEAN) {
1382 // Drop out of IN_GROUP if we're in it.
1383 if (mode == IN_GROUP || mode == IN_GROUP2)
1387 if (it != end && is_double_quote(*it)) {
1388 // Quoted boolean term (can contain any character).
1389 bool fancy = (*it != '"');
1393 // Interpret "" as an escaped ".
1394 if (++it == end || *it != '"')
1396 } else if (fancy && is_double_quote(*it)) {
1397 // If the opening quote was ASCII, then the
1398 // closing one must be too - otherwise
1399 // the user can't protect non-ASCII double
1400 // quote characters by quoting or escaping.
1404 Unicode::append_utf8(name, *it++);
1407 // A boolean filter prefix can't be applied to a subexpression, so
1408 // just use anything following the prefix until the
1409 // next space or ')' as part of the boolean filter term.
1411 while (it != end && *it > ' ' && *it != ')')
1412 Unicode::append_utf8(name, *it++);
1414 // Build the unstemmed form in field.
1417 // Clear any pending range error.
1419 Term * token = new Term(&state, name, field_info, field);
1420 Parse(&parser, BOOLEAN_FILTER, token, &state);
1424 if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
1425 // Prefixed phrase, e.g.: subject:"space flight"
1426 mode = IN_PREFIXED_QUOTES;
1427 Parse(&parser, QUOTE, NULL, &state);
1431 prefix_stack.push_back(field_info);
1435 if (ch == '(' && (flags & FLAG_BOOLEAN)) {
1436 // Prefixed subexpression, e.g.: title:(fast NEAR food)
1438 Parse(&parser, BRA, NULL, &state);
1442 prefix_stack.push_back(field_info);
1447 // Allow 'path:/usr/local' but not 'foo::bar::baz'.
1448 while (is_phrase_generator(ch)) {
1455 if (is_wordchar(ch)) {
1460 // It looks like a prefix but isn't, so parse it as a term instead.
1470 bool needs_word_break = false;
1471 size_t first_wildcard = string::npos;
1472 size_t term_char_count;
1473 unsigned edit_distance = NO_EDIT_DISTANCE;
1474 string term = parse_term(it, end, try_word_break, flags,
1475 needs_word_break, was_acronym, first_wildcard,
1476 term_char_count, edit_distance);
1478 if (first_wildcard == string::npos &&
1479 edit_distance == NO_EDIT_DISTANCE &&
1480 (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
1481 (flags & FLAG_BOOLEAN) &&
1482 // Don't want to interpret A.N.D. as an AND operator.
1485 term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
1486 // Boolean operators.
1488 if (flags & FLAG_BOOLEAN_ANY_CASE) {
1489 for (string::iterator i = op.begin(); i != op.end(); ++i) {
1493 if (op.size() == 3) {
1495 Parse(&parser, AND, NULL, &state);
1496 goto just_had_operator;
1499 Parse(&parser, NOT, NULL, &state);
1500 goto just_had_operator;
1503 Parse(&parser, XOR, NULL, &state);
1504 goto just_had_operator;
1507 if (it != end && *it == '/') {
1509 Utf8Iterator p = it;
1510 while (++p != end && U_isdigit(*p)) {
1511 width = (width * 10) + (*p - '0');
1513 if (width && (p == end || is_whitespace(*p))) {
1515 Parse(&parser, ADJ, new Term(width), &state);
1516 goto just_had_operator;
1519 Parse(&parser, ADJ, NULL, &state);
1520 goto just_had_operator;
1524 Parse(&parser, SYN, NULL, &state);
1525 goto just_had_operator;
1527 } else if (op.size() == 2) {
1529 Parse(&parser, OR, NULL, &state);
1530 goto just_had_operator;
1532 } else if (op.size() == 4) {
1534 if (it != end && *it == '/') {
1536 Utf8Iterator p = it;
1537 while (++p != end && U_isdigit(*p)) {
1538 width = (width * 10) + (*p - '0');
1540 if (width && (p == end || is_whitespace(*p))) {
1542 Parse(&parser, NEAR, new Term(width), &state);
1543 goto just_had_operator;
1546 Parse(&parser, NEAR, NULL, &state);
1547 goto just_had_operator;
1553 // If no prefix is set, use the default one.
1554 if (!field_info) field_info = prefix_stack.back();
1556 Assert(field_info->type == NON_BOOLEAN);
1559 string unstemmed_term(term);
1560 term = Unicode::tolower(term);
1562 // Reuse stem_strategy - STEM_SOME here means "stem terms except
1563 // when used with positional operators".
1564 stem_strategy stem_term = stem_action;
1565 if (stem_term != STEM_NONE) {
1566 if (stemmer.is_none()) {
1567 stem_term = STEM_NONE;
1568 } else if (first_wildcard != string::npos ||
1569 edit_distance != NO_EDIT_DISTANCE) {
1570 stem_term = STEM_NONE;
1571 } else if (stem_term == STEM_SOME ||
1572 stem_term == STEM_SOME_FULL_POS) {
1573 if (!should_stem(unstemmed_term) ||
1574 (it != end && is_stem_preventer(*it))) {
1575 // Don't stem this particular term.
1576 stem_term = STEM_NONE;
1581 if (first_wildcard != string::npos) {
1582 if (first_wildcard < state.get_min_wildcard_prefix_len()) {
1583 errmsg = "Too few characters before wildcard";
1588 Term * term_obj = new Term(&state, term, field_info,
1589 unstemmed_term, stem_term, term_pos++,
1592 if (first_wildcard != string::npos ||
1593 edit_distance != NO_EDIT_DISTANCE) {
1594 if (mode == IN_GROUP || mode == IN_GROUP2) {
1595 // Drop out of IN_GROUP and flag that the group
1596 // can be empty if all members are stopwords.
1597 if (mode == IN_GROUP2)
1598 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1602 first_wildcard != string::npos ? WILD_TERM : EDIT_TERM,
1608 if (needs_word_break) {
1609 Parse(&parser, UNBROKEN_WORDS, term_obj, &state);
1610 // Drop out of IN_GROUP mode.
1611 if (mode == IN_GROUP || mode == IN_GROUP2)
1613 if (it == end) break;
1617 if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1618 if (it == end && (flags & FLAG_PARTIAL)) {
1619 auto min_len = state.get_min_partial_prefix_len();
1620 if (term_char_count >= min_len) {
1621 if (mode == IN_GROUP || mode == IN_GROUP2) {
1622 // Drop out of IN_GROUP and flag that the group
1623 // can be empty if all members are stopwords.
1624 if (mode == IN_GROUP2)
1625 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1628 // Final term of a partial match query, with no
1629 // following characters - treat as a wildcard.
1630 Parse(&parser, PARTIAL_TERM, term_obj, &state);
1636 // Check spelling, if we're a normal term, and any of the prefixes is empty.
1638 if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
1639 const auto& prefixes = field_info->prefixes;
1640 for (const string& prefix : prefixes) {
1641 if (!prefix.empty())
1643 const string & suggest = db.get_spelling_suggestion(term);
1644 if (!suggest.empty()) {
1645 if (corrected_query.empty()) corrected_query = qs;
1646 size_t term_end_index = it.raw() - qs.data();
1647 size_t n = term_end_index - term_start_index;
1648 size_t pos = UNSIGNED_OVERFLOW_OK(term_start_index + correction_offset);
1649 corrected_query.replace(pos, n, suggest);
1650 UNSIGNED_OVERFLOW_OK(correction_offset += suggest.size());
1651 UNSIGNED_OVERFLOW_OK(correction_offset -= n);
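// Illustrative sketch (not part of the original file): the corrected string
// built above is exposed through the public API after parsing, e.g.:
//
//   Xapian::QueryParser qp;
//   qp.set_database(db); // 'db' assumed to contain spelling data
//   qp.parse_query("flying crocodilw",
//       Xapian::QueryParser::FLAG_DEFAULT |
//       Xapian::QueryParser::FLAG_SPELLING_CORRECTION);
//   std::string fixed = qp.get_corrected_query_string();
//   // fixed is "flying crocodile" if the database suggests that correction,
//   // and stays empty when no correction was applied.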
1657 if (mode == IN_PHRASED_TERM) {
1658 Parse(&parser, PHR_TERM, term_obj, &state);
1660 // See if the next token will be PHR_TERM - if so, this one
1661 // needs to be TERM not GROUP_TERM.
1662 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
1663 is_phrase_generator(*it)) {
1664 // FIXME: can we clean this up?
1665 Utf8Iterator p = it;
1668 } while (p != end && is_phrase_generator(*p));
1669 // Don't generate a phrase unless the phrase generators are
1670 // immediately followed by another term.
1671 if (p != end && is_wordchar(*p)) {
1677 if (mode == IN_GROUP || mode == IN_GROUP2) {
1681 Parse(&parser, token, term_obj, &state);
1682 if (token == TERM && mode != DEFAULT)
1687 if (it == end) break;
1689 if (is_phrase_generator(*it)) {
1690 // Skip multiple phrase generators.
1693 } while (it != end && is_phrase_generator(*it));
1694 // Don't generate a phrase unless the phrase generators are
1695 // immediately followed by another term.
1696 if (it != end && is_wordchar(*it)) {
1697 mode = IN_PHRASED_TERM;
1698 term_start_index = it.raw() - qs.data();
1701 } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1702 int old_mode = mode;
1704 if (!last_was_operator_needing_term && is_whitespace(*it)) {
1706 // Skip multiple whitespace.
1709 } while (it != end && is_whitespace(*it));
1710 // Don't generate a group unless the terms are only separated by whitespace.
1712 if (it != end && is_wordchar(*it)) {
1713 if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
1724 // Implicitly close any unclosed quotes.
1725 if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
1726 Parse(&parser, QUOTE, NULL, &state);
1728 // Implicitly close all unclosed brackets.
1729 while (prefix_stack.size() > 1) {
1730 Parse(&parser, KET, NULL, &state);
1731 prefix_stack.pop_back();
1733 Parse(&parser, 0, NULL, &state);
1736 errmsg = state.error;
1744 Query* query = NULL;
1747 // filter is a map from prefix to a query for that prefix. Queries with
1748 // the same prefix are combined with OR, and the results of this are
1749 // combined with AND to get the full filter.
1750 map<string, Query> filter;
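// Illustrative sketch (not part of the original file; the H and A prefixes
// are assumed to have been registered with add_boolean_prefix()): for a
// query such as
//
//   site:xapian.org site:example.org author:olly keyword
//
// the two site: filters share a grouping and are exclusive, so they're OR-ed
// together, then the groupings are AND-ed by merge_filters(), giving roughly
//
//   keyword FILTER ((Hxapian.org OR Hexample.org) AND Aolly)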
1755 ProbQuery(Query* query_) : query(query_) {}
1763 void add_filter(const string& grouping, const Query & q) {
1764 filter[grouping] = q;
1767 void append_filter(const string& grouping, const Query & qnew) {
1768 auto it = filter.find(grouping);
1769 if (it == filter.end()) {
1770 filter.insert(make_pair(grouping, qnew));
1772 Query & q = it->second;
1773 // We OR multiple filters with the same prefix if they're
1774 // exclusive, otherwise we AND them.
1775 bool exclusive = !grouping.empty();
1784 void add_filter_range(const string& grouping, const Query & range) {
1785 filter[grouping] = range;
1788 void append_filter_range(const string& grouping, const Query & range) {
1789 Query & q = filter[grouping];
1793 Query merge_filters() const {
1794 auto i = filter.begin();
1795 Assert(i != filter.end());
1796 Query q = i->second;
1797 while (++i != filter.end()) {
1804 /// A group of terms separated only by whitespace.
1806 vector<Term *> terms;
1808 /** Controls how to handle a group where all terms are stopwords.
1810 * If true, then as_group() returns NULL. If false, then the
1811 * stopword status of the terms is ignored.
1815 TermGroup(Term* t1, Term* t2) : empty_ok(false) {
1821 /// Factory function - ensures heap allocation.
1822 static TermGroup* create(Term* t1, Term* t2) {
1823 return new TermGroup(t1, t2);
1827 for (auto&& t : terms) {
1832 /// Add a Term object to this TermGroup object.
1833 void add_term(Term * term) {
1834 terms.push_back(term);
1837 /// Set the empty_ok flag.
1838 void set_empty_ok() { empty_ok = true; }
1840 /// Convert to a Xapian::Query * using default_op.
1841 Query * as_group(State *state) const;
1845 TermGroup::as_group(State *state) const
1847 const Xapian::Stopper * stopper = state->get_stopper();
1848 size_t stoplist_size = state->stoplist_size();
1849 bool default_op_is_positional = is_positional(state->default_op());
1851 Query::op default_op = state->default_op();
1852 vector<Query> subqs;
1853 subqs.reserve(terms.size());
1854 if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
1855 // Check for multi-word synonyms.
1856 Database db = state->get_database();
1859 vector<Term*>::size_type begin = 0;
1860 vector<Term*>::size_type i = begin;
1861 while (terms.size() - i > 0) {
1862 size_t longest_match = 0;
1863 // This value is never used, but GCC 4.8 warns with
1864 // -Wmaybe-uninitialized (GCC 5.4 doesn't).
1865 vector<Term*>::size_type longest_match_end = 0;
1866 if (terms.size() - i >= 2) {
1867 // Greedily try to match as many consecutive words as possible.
1868 key = terms[i]->name;
1870 key += terms[i + 1]->name;
1871 TermIterator synkey(db.synonym_keys_begin(key));
1872 TermIterator synend(db.synonym_keys_end(key));
1873 if (synkey != synend) {
1874 longest_match = key.size();
1875 longest_match_end = i + 2;
1876 for (auto j = i + 2; j < terms.size(); ++j) {
1878 key += terms[j]->name;
1879 synkey.skip_to(key);
1880 if (synkey == synend)
1882 const string& found = *synkey;
1883 if (!startswith(found, key))
1885 if (found.size() == key.size()) {
1886 longest_match = key.size();
1887 longest_match_end = j + 1;
1892 if (longest_match == 0) {
1893 // No multi-synonym matches at position i.
1894 if (stopper && (*stopper)(terms[i]->name)) {
1895 state->add_to_stoplist(terms[i]);
1897 if (default_op_is_positional)
1898 terms[i]->need_positions();
1899 subqs.push_back(terms[i]->get_query_with_auto_synonyms());
1904 i = longest_match_end;
1905 key.resize(longest_match);
1907 vector<Query> subqs2;
1908 for (auto j = begin; j != i; ++j) {
1909 if (stopper && (*stopper)(terms[j]->name)) {
1910 state->add_to_stoplist(terms[j]);
1912 if (default_op_is_positional)
1913 terms[j]->need_positions();
1914 subqs2.push_back(terms[j]->get_query());
1917 Query q_original_terms;
1918 if (default_op_is_positional) {
1919 q_original_terms = Query(default_op,
1920 subqs2.begin(), subqs2.end(),
1923 q_original_terms = Query(default_op,
1924 subqs2.begin(), subqs2.end());
1928 // Use the position of the first term for the synonyms.
1929 TermIterator syn = db.synonyms_begin(key);
1930 Query q(Query::OP_SYNONYM,
1931 SynonymIterator(syn, terms[begin]->pos, &q_original_terms),
1932 SynonymIterator(db.synonyms_end(key)));
1938 vector<Term*>::const_iterator i;
1939 for (i = terms.begin(); i != terms.end(); ++i) {
1940 if (stopper && (*stopper)((*i)->name)) {
1941 state->add_to_stoplist(*i);
1943 if (default_op_is_positional)
1944 (*i)->need_positions();
1945 subqs.push_back((*i)->get_query_with_auto_synonyms());
1950 if (!empty_ok && stopper && subqs.empty() &&
1951 stoplist_size < state->stoplist_size()) {
1952 // This group is all stopwords, so roll back, disable the stopper
1953 // temporarily, and reprocess this group.
1954 state->stoplist_resize(stoplist_size);
1960 if (!subqs.empty()) {
1961 if (default_op_is_positional) {
1962 q = new Query(default_op, subqs.begin(), subqs.end(),
1965 q = new Query(default_op, subqs.begin(), subqs.end());
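// Illustrative sketch (not part of the original file): the greedy multi-word
// matching above is driven by synonym keys containing spaces, e.g.:
//
//   Xapian::WritableDatabase db("idx", Xapian::DB_CREATE_OR_OPEN);
//   db.add_synonym("false positive", "fp");
//
//   Xapian::QueryParser qp;
//   qp.set_database(db);
//   Xapian::Query q = qp.parse_query("false positive rate",
//       Xapian::QueryParser::FLAG_DEFAULT |
//       Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS);
//   // "false positive" is expanded via OP_SYNONYM to include "fp"; "rate"
//   // combines with the default operator as usual.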
1972 /// Some terms which form a positional sub-query.
1974 vector<Term *> terms;
1978 * size_t(-1) means don't use positional info (so an OP_AND query gets built instead).
1983 /** Keep track of whether the terms added all have the same list of
1984 * prefixes. If so, we'll build a set of phrases, one using each prefix.
1985 * This works around the limitation that a phrase cannot have multiple
1986 * components which are "OR" combinations of terms, but is also probably
1987 * what users expect: i.e., if a user specifies a phrase in a field, and
1988 * that field maps to multiple prefixes, the user probably wants a phrase
1989 * returned with all terms having one of those prefixes, rather than a
1990 * phrase comprised of terms with differing prefixes.
1992 bool uniform_prefixes;
1994 /** The list of prefixes of the terms added.
1995 * This will be NULL if the terms have different prefixes.
1997 const vector<string>* prefixes;
1999 Query opwindow_subq(Query::op op,
2000 const vector<Query>& v,
2001 Xapian::termcount w) const {
2002 if (op == Query::OP_AND) {
2003 return Query(op, v.begin(), v.end());
2005 return Query(op, v.begin(), v.end(), w);
2008 /// Convert to a query using the given operator and window size.
2009 Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
2010 if (window == size_t(-1)) op = Query::OP_AND;
2012 size_t n_terms = terms.size();
2013 Xapian::termcount w = w_delta + terms.size();
2014 if (uniform_prefixes) {
2016 for (auto&& prefix : *prefixes) {
2017 vector<Query> subqs;
2018 subqs.reserve(n_terms);
2019 for (Term* t : terms) {
2020 subqs.push_back(Query(t->make_term(prefix), 1, t->pos));
2022 add_to_query(q, Query::OP_OR, opwindow_subq(op, subqs, w));
2026 vector<Query> subqs;
2027 subqs.reserve(n_terms);
2028 for (Term* t : terms) {
2029 subqs.push_back(t->get_query());
2031 q = new Query(opwindow_subq(op, subqs, w));
2038 explicit Terms(bool no_pos)
2039 : window(no_pos ? size_t(-1) : 0),
2040 uniform_prefixes(true),
2044 /// Factory function - ensures heap allocation.
2045 static Terms* create(State* state) {
2046 return new Terms(state->flags & QueryParser::FLAG_NO_POSITIONS);
2050 for (auto&& t : terms) {
2055 /// Add an unstemmed Term object to this Terms object.
2056 void add_positional_term(Term * term) {
2057 const auto& term_prefixes = term->field_info->prefixes;
2058 if (terms.empty()) {
2059 prefixes = &term_prefixes;
2060 } else if (uniform_prefixes && prefixes != &term_prefixes) {
2061 if (*prefixes != term_prefixes) {
2063 uniform_prefixes = false;
2066 term->need_positions();
2067 terms.push_back(term);
2070 void adjust_window(size_t alternative_window) {
2071 if (alternative_window > window) window = alternative_window;
2074 /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
2075 Query * as_phrase_query() const {
2076 return as_opwindow_query(Query::OP_PHRASE, 0);
2079 /// As as_phrase_query(), but expand any synonyms registered for the whole phrase.
2080 Query* as_synonym_phrase_query(State* state) const {
2083 for (Term* t : terms) {
2084 if (!name.empty()) {
2092 for (auto&& prefix : *prefixes) {
2093 // Only try unstemmed for multi-word.
2095 if (!prefix.empty()) {
2097 if (prefix_needs_colon(prefix, name[0])) term += ':';
2101 Xapian::Database db = state->get_database();
2102 Xapian::TermIterator syn = db.synonyms_begin(term);
2103 Xapian::TermIterator end = db.synonyms_end(term);
2105 // Caution: this does `delete this;`!
2106 Query* q = as_opwindow_query(Query::OP_PHRASE, 0);
2107 // FIXME: Is this right when there's more than one entry in prefixes?
2109 Query* q2 = new Query(q->OP_SYNONYM,
2110 SynonymIterator(syn, pos, q),
2111 SynonymIterator(end));
2114 // FIXME: Handle multiple prefixes properly...
2119 /// Convert to a Xapian::Query * using OP_NEAR.
2120 Query * as_near_query() const {
2121 // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
2122 // means a window size of 11. For more than 2 terms, we just add one
2123 // to the window size for each extra term.
2126 return as_opwindow_query(Query::OP_NEAR, w - 1);
2129 /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
2130 Query * as_adj_query() const {
2131 // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
2132 // which means a window size of 11. For more than 2 terms, we just add
2133 // one to the window size for each extra term.
2136 return as_opwindow_query(Query::OP_PHRASE, w - 1);
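// Illustrative sketch (not part of the original file): the window convention
// above matches what you'd pass to the public Query constructor directly -
// for "fast NEAR food" (two terms, default distance 10) the window is 11:
//
//   Xapian::Query subq[] = { Xapian::Query("fast"), Xapian::Query("food") };
//   Xapian::Query near_q(Xapian::Query::OP_NEAR, subq, subq + 2, 11);
//
// An explicit "fast NEAR/3 food" uses 3 in place of the default 10, giving a
// window of 4 for two terms (an assumption based on the code above).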
2141 Term::as_positional_unbroken(Terms* terms) const
2144 if (state->flags & QueryParser::FLAG_WORD_BREAKS) {
2145 for (WordIterator tk(name); tk != WordIterator(); ++tk) {
2146 const string& t = *tk;
2147 Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
2148 terms->add_positional_term(c);
2154 // Add each individual character to the phrase.
2156 for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
2157 Unicode::append_utf8(t, *it);
2158 Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
2159 terms->add_positional_term(c);
2163 // FIXME: we want to add the n-grams as filters too for efficiency.
2168 // Helper macro to check for missing arguments to a boolean operator.
2169 #define VET_BOOL_ARGS(A, B, OP_TXT) \
2172 state->error = "Syntax: <expression> " OP_TXT " <expression>";\
2173 yy_parse_failed(yypParser);\
2180 %token_type {Term *}
2181 %token_destructor { delete $$; }
2183 %extra_argument {State * state}
2186 // If we've not already set an error message, set a default one.
2187 if (!state->error) state->error = "parse error";
2191 yy_parse_failed(yypParser);
2194 // Operators, grouped in order of increasing precedence:
2200 %left LOVE HATE HATE_AFTER_AND SYNONYM.
2203 // Destructors for terminal symbols:
2205 // TERM is a query term, including prefix (if any).
2206 %destructor TERM { delete $$; }
2208 // GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
2209 // is only separated by whitespace characters.
2210 %destructor GROUP_TERM { delete $$; }
2212 // PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
2213 // separated only by one or more phrase generator characters (hyphen and
2214 // apostrophe are common examples - see is_phrase_generator() for the list
2215 // of all punctuation which does this).
2216 %destructor PHR_TERM { delete $$; }
2218 // EDIT_TERM is like a TERM, but with an edit distance.
2219 %destructor EDIT_TERM { delete $$; }
2221 // WILD_TERM is like a TERM, but with wildcards which needs to be expanded.
2222 %destructor WILD_TERM { delete $$; }
2224 // PARTIAL_TERM is like a TERM, but it's at the end of the query string and
2225 // we're doing "search as you type". It expands to something like:
2226 // WILD_TERM($$+"*") OR stemmed_form
2227 %destructor PARTIAL_TERM { delete $$; }
2229 // BOOLEAN_FILTER is a query term with a prefix registered using
2230 // add_boolean_prefix(). It's added to the query using an OP_FILTER operator,
2231 // (or OP_AND_NOT if it's negated) e.g. site:xapian.org or -site:xapian.org
2232 %destructor BOOLEAN_FILTER { delete $$; }
2236 // query - The whole query - just an expr or nothing.
2238 // The query non-terminal doesn't need a type, so just give it a dummy one.
2241 query ::= expr(E). {
2242 // Save the parsed query in the State structure so we can return it.
2247 state->query = Query();
2252 // Handle a query string with no terms in.
2253 state->query = Query();
2256 // expr - A query expression.
2258 %type expr {Query *}
2259 %destructor expr { delete $$; }
2261 expr(E) ::= prob_expr(E).
2263 expr(A) ::= bool_arg(A) AND bool_arg(B). {
2264 VET_BOOL_ARGS(A, B, "AND");
2269 expr(A) ::= bool_arg(A) NOT bool_arg(B). {
2270 if (!A && (state->flags & QueryParser::FLAG_PURE_NOT)) {
2271 // 'NOT foo' -> '(0 * <alldocuments>) NOT foo'
2273 // We scale the <alldocuments> by 0 so it doesn't count towards the
2274 // number of matching subqueries since that allows the query optimiser
2275 // to eliminate it if other subqueries are combined in an AND-like
2276 // way (e.g. 'bar AND (NOT foo)').
2277 A = new Query(0.0, Query(string(), 1, 0));
2279 VET_BOOL_ARGS(A, B, "NOT");
2284 expr(A) ::= bool_arg(A) AND NOT bool_arg(B). [NOT] {
2285 VET_BOOL_ARGS(A, B, "AND NOT");
2290 expr(A) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND] {
2291 VET_BOOL_ARGS(A, B, "AND");
2296 expr(A) ::= bool_arg(A) OR bool_arg(B). {
2297 VET_BOOL_ARGS(A, B, "OR");
2302 expr(A) ::= bool_arg(A) XOR bool_arg(B). {
2303 VET_BOOL_ARGS(A, B, "XOR");
2308 expr(A) ::= bool_arg(A) SYN bool_arg(B). {
2309 VET_BOOL_ARGS(A, B, "SYN");
2310 *A = Query(Query::OP_SYNONYM, *A, *B);
2314 // bool_arg - an argument to a boolean operator such as AND or OR.
2316 %type bool_arg {Query *}
2317 %destructor bool_arg { delete $$; }
2319 bool_arg(A) ::= expr(A).
2321 bool_arg(A) ::= . [ERROR] {
2322 // Set the argument to NULL, which enables the bool_arg-using rules in
2323 // expr above to report uses of AND, OR, etc which don't have two operands.

// prob_expr - a single compound term, or a prob.

%type prob_expr {Query *}
%destructor prob_expr { delete $$; }

prob_expr(E) ::= prob(P). {
    E = P->query;
    P->query = NULL;
    // Handle any "+ terms".
    if (P->love) {
        if (P->love->empty()) {
            // The "+ terms" are empty, so the query matches nothing.
            delete E;
            E = P->love;
        } else if (E) {
            swap(E, P->love);
            add_to_query(E, Query::OP_AND_MAYBE, P->love);
        } else {
            E = P->love;
        }
        P->love = NULL;
    }
    // Handle any boolean filters.
    if (!P->filter.empty()) {
        if (E) {
            add_to_query(E, Query::OP_FILTER, P->merge_filters());
        } else {
            // Make the query a boolean one.
            E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
        }
    }
    // Handle any "- terms".
    if (P->hate && !P->hate->empty()) {
        if (!E) {
            // Can't just hate!
            yy_parse_failed(yypParser);
            return;
        }
        *E = Query(Query::OP_AND_NOT, *E, *P->hate);
    }
    delete P;
}

prob_expr(E) ::= term(E).

// prob - a sub-expression consisting of stop_terms, "+" terms, "-" terms,
// boolean filters, and/or ranges.
//
// Note: stop_term can also be several things other than a simple term!

%type prob {ProbQuery *}
%destructor prob { delete $$; }

prob(P) ::= RANGE(R). {
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P = new ProbQuery; /*P-overwrites-R*/
    P->add_filter_range(grouping, range);
}

prob(P) ::= stop_prob(P) RANGE(R). {
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P->append_filter_range(grouping, range);
    delete R;
}

prob(P) ::= stop_term(T) stop_term(U). {
    P = new ProbQuery(T); /*P-overwrites-T*/
    if (U) {
        Query::op op = state->default_op();
        if (P->query && is_positional(op)) {
            // If default_op is OP_NEAR or OP_PHRASE, set the window size to
            // 11 for the first pair of terms and it will automatically grow
            // by one for each subsequent term.
            Query * subqs[2] = { P->query, U };
            *(P->query) = Query(op, subqs, subqs + 2, 11);
            delete U;
        } else {
            add_to_query(P->query, op, U);
        }
    }
}
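
// Illustrative QueryParser usage (a sketch, not part of the grammar): when
// the default operator is positional, consecutive terms are collected into
// a single positional query whose window grows with the number of terms
// (11 for two terms, 12 for three, and so on, as noted above):
//
//   Xapian::QueryParser qp;
//   qp.set_default_op(Xapian::Query::OP_NEAR);
//   // All three terms end up in one OP_NEAR query with a window of 12.
//   Xapian::Query q = qp.parse_query("one two three");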

prob(P) ::= prob(P) stop_term(T). {
    // If T is a stopword, there's nothing to do here.
    if (T) add_to_query(P->query, state->default_op(), T);
}

prob(P) ::= LOVE term(T). {
    P = new ProbQuery;
    if (state->default_op() == Query::OP_AND) {
        P->query = T;
    } else {
        P->love = T;
    }
}

prob(P) ::= stop_prob(P) LOVE term(T). {
    if (state->default_op() == Query::OP_AND) {
        /* The default op is AND, so we just put loved terms into the query
         * (in this case the only effect of love is to ignore the stopword
         * list). */
        add_to_query(P->query, Query::OP_AND, T);
    } else {
        add_to_query(P->love, Query::OP_AND, T);
    }
}
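
// Illustrative QueryParser usage (a sketch, not part of the grammar): with
// the default OP_OR, a loved ("+") term must match and the rest of the query
// only affects ranking via OP_AND_MAYBE (see prob_expr above); with a
// default of OP_AND, "+" merely bypasses the stopword list:
//
//   Xapian::QueryParser qp;  // default op is OP_OR
//   Xapian::Query q = qp.parse_query("+xapian parser",
//                                    Xapian::QueryParser::FLAG_LOVEHATE);
//   // Roughly: xapian AND_MAYBE parser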

prob(P) ::= HATE term(T). {
    P = new ProbQuery;
    P->hate = T;
}

prob(P) ::= stop_prob(P) HATE term(T). {
    add_to_query(P->hate, Query::OP_OR, T);
}

prob(P) ::= HATE BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->hate = new Query(T->get_query());
    delete T;
}

prob(P) ::= stop_prob(P) HATE BOOLEAN_FILTER(T). {
    add_to_query(P->hate, Query::OP_OR, T->get_query());
    delete T;
}

prob(P) ::= BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->add_filter(T->get_grouping(), T->get_query());
    delete T;
}

prob(P) ::= stop_prob(P) BOOLEAN_FILTER(T). {
    P->append_filter(T->get_grouping(), T->get_query());
    delete T;
}

prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER.
    P = new ProbQuery;
    P->filter[T->get_grouping()] = T->get_query();
    delete T;
}

prob(P) ::= stop_prob(P) LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER.
    // We OR filters with the same prefix...
    Query & q = P->filter[T->get_grouping()];
    q |= T->get_query();
    delete T;
}
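
// Illustrative QueryParser usage (a sketch, not part of the grammar):
// filters sharing a grouping are OR'd together (as above), while different
// groupings are AND'd before being applied to the rest of the query with
// OP_FILTER:
//
//   Xapian::QueryParser qp;
//   qp.add_boolean_prefix("site", "H");
//   qp.add_boolean_prefix("lang", "L");
//   Xapian::Query q = qp.parse_query(
//       "parser site:xapian.org site:trac.xapian.org lang:en");
//   // Filter is roughly: (Hxapian.org OR Htrac.xapian.org) AND Len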

// stop_prob - A prob or a stop_term.

%type stop_prob {ProbQuery *}
%destructor stop_prob { delete $$; }

stop_prob(P) ::= prob(P).

stop_prob(P) ::= stop_term(T). {
    P = new ProbQuery(T); /*P-overwrites-T*/
}

// stop_term - A term which should be checked against the stopword list,
// or a compound_term.
//
// If a term is loved, hated, or in a phrase, we don't want to consult the
// stopword list, so stop_term isn't used there (instead term is).

%type stop_term {Query *}
%destructor stop_term { delete $$; }

stop_term(T) ::= TERM(U). {
    if (state->is_stopword(U)) {
        T = NULL;
        state->add_to_stoplist(U);
    } else {
        T = new Query(U->get_query_with_auto_synonyms());
    }
    delete U;
}
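
// Illustrative QueryParser usage (a sketch, not part of the grammar):
// stopwords are dropped from unquoted, unloved, unhated terms and recorded
// in the stoplist:
//
//   Xapian::SimpleStopper stopper;
//   stopper.add("the");
//   Xapian::QueryParser qp;
//   qp.set_stopper(&stopper);
//   Xapian::Query q = qp.parse_query("the cat");
//   // "the" is dropped (see qp.stoplist_begin()); q is just: cat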

stop_term(T) ::= compound_term(T).

// term - A term or a compound_term.

%type term {Query *}
%destructor term { delete $$; }

term(T) ::= TERM(U). {
    T = new Query(U->get_query_with_auto_synonyms());
    delete U;
}

term(T) ::= compound_term(T).

// compound_term - An EDIT_TERM, WILD_TERM or PARTIAL_TERM, a quoted phrase
// (with or without prefix), a phrased_term, group, near_expr, adj_expr, or
// a bracketed subexpression (with or without prefix).

%type compound_term {Query *}
%destructor compound_term { delete $$; }

compound_term(T) ::= EDIT_TERM(U).
    { T = U->as_fuzzy_query(state); /*T-overwrites-U*/ }

compound_term(T) ::= WILD_TERM(U).
    { T = U->as_wildcarded_query(state); /*T-overwrites-U*/ }

compound_term(T) ::= PARTIAL_TERM(U).
    { T = U->as_partial_query(state); /*T-overwrites-U*/ }

compound_term(T) ::= QUOTE phrase(P) QUOTE.
    { T = P->as_phrase_query(); }

compound_term(T) ::= phrased_term(P).
    { T = P->as_phrase_query(); /*T-overwrites-P*/ }

compound_term(T) ::= group(P).
    { T = P->as_group(state); /*T-overwrites-P*/ }

compound_term(T) ::= near_expr(P).
    { T = P->as_near_query(); /*T-overwrites-P*/ }

compound_term(T) ::= adj_expr(P).
    { T = P->as_adj_query(); /*T-overwrites-P*/ }

compound_term(T) ::= BRA expr(E) KET.
    { T = E; }

compound_term(T) ::= SYNONYM TERM(U). {
    T = new Query(U->get_query_with_synonyms());
    delete U;
}
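
// Illustrative QueryParser usage (a sketch, not part of the grammar): with
// FLAG_SYNONYM, a term written as "~term" is explicitly expanded using
// synonyms from the database:
//
//   Xapian::Database db("/path/to/db");
//   Xapian::QueryParser qp;
//   qp.set_database(db);  // synonyms are read from this database
//   Xapian::Query q = qp.parse_query("~car",
//                                    Xapian::QueryParser::FLAG_SYNONYM);
//   // Roughly: (car SYNONYM automobile) if such a synonym is registered.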

compound_term(T) ::= SYNONYM QUOTE phrase(P) QUOTE.
    { T = P->as_synonym_phrase_query(state); }

compound_term(T) ::= UNBROKEN_WORDS(U).
    { T = U->as_unbroken_query(); /*T-overwrites-U*/ }

// phrase - The "inside the quotes" part of a double-quoted phrase.

%type phrase {Terms *}
%destructor phrase { delete $$; }

phrase(P) ::= TERM(T). {
    P = Terms::create(state);
    P->add_positional_term(T);
}

phrase(P) ::= UNBROKEN_WORDS(T). {
    P = Terms::create(state);
    T->as_positional_unbroken(P);
}

phrase(P) ::= phrase(P) TERM(T). {
    P->add_positional_term(T);
}

phrase(P) ::= phrase(P) UNBROKEN_WORDS(T). {
    T->as_positional_unbroken(P);
}
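
// Illustrative QueryParser usage (a sketch, not part of the grammar): a
// double-quoted phrase (FLAG_PHRASE, enabled by default) becomes an
// OP_PHRASE query over the words inside the quotes:
//
//   Xapian::QueryParser qp;
//   Xapian::Query q = qp.parse_query("\"free software\"");
//   // Roughly: (free PHRASE 2 software)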

// phrased_term - A phrased term works like a single term, but is actually
// 2 or more terms linked together into a phrase by punctuation.  There must
// be at least 2 terms in order to be able to have punctuation between the
// terms!

%type phrased_term {Terms *}
%destructor phrased_term { delete $$; }
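
// Illustrative QueryParser usage (a sketch, not part of the grammar):
// punctuation such as a hyphen links adjacent words into an implicit phrase
// even without quotes:
//
//   Xapian::QueryParser qp;
//   Xapian::Query q = qp.parse_query("anti-spam");
//   // Roughly the same as "\"anti spam\"": (anti PHRASE 2 spam)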

phrased_term(P) ::= TERM(T) PHR_TERM(U). {
    P = Terms::create(state);
    P->add_positional_term(T);
    P->add_positional_term(U);
}

phrased_term(P) ::= phrased_term(P) PHR_TERM(T). {
    P->add_positional_term(T);
}

// group - A group of terms separated only by whitespace - candidates for
// multi-term synonyms.

%type group {TermGroup *}
%destructor group { delete $$; }

group(P) ::= TERM(T) GROUP_TERM(U). {
    P = TermGroup::create(T, U); /*P-overwrites-T*/
}

group(P) ::= group(P) GROUP_TERM(T). {
    P->add_term(T);
}

group(P) ::= group(P) EMPTY_GROUP_OK. {
    P->set_empty_ok();
}
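
// Illustrative QueryParser usage (a sketch, not part of the grammar): with
// FLAG_AUTO_MULTIWORD_SYNONYMS, whitespace-separated groups of terms are
// checked against multi-word synonyms stored in the database:
//
//   Xapian::Database db("/path/to/db");
//   Xapian::QueryParser qp;
//   qp.set_database(db);  // synonyms are read from this database
//   Xapian::Query q = qp.parse_query("sun microsystems",
//       Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS |
//       Xapian::QueryParser::FLAG_DEFAULT);
//   // If "sun microsystems" has synonyms they're used; otherwise the two
//   // terms are simply combined with the default operator.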

// near_expr - 2 or more terms with NEAR in between.  There must be at least
// 2 terms in order for there to be any NEAR operators!

%type near_expr {Terms *}
%destructor near_expr { delete $$; }

near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
    P = Terms::create(state);
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}

near_expr(P) ::= near_expr(P) NEAR(N) TERM(T). {
    P->add_positional_term(T);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
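
// Illustrative QueryParser usage (a sketch, not part of the grammar): NEAR
// combines terms with OP_NEAR; an optional "/N" suffix adjusts the
// positional window via adjust_window() above:
//
//   Xapian::QueryParser qp;
//   Xapian::Query q = qp.parse_query("stemming NEAR/3 algorithm");
//   // Both terms must occur close together, in either order.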

// adj_expr - 2 or more terms with ADJ in between.  There must be at least
// 2 terms in order for there to be any ADJ operators!

%type adj_expr {Terms *}
%destructor adj_expr { delete $$; }

adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
    P = Terms::create(state);
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}

adj_expr(P) ::= adj_expr(P) ADJ(N) TERM(T). {
    P->add_positional_term(T);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
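
// Illustrative QueryParser usage (a sketch, not part of the grammar): ADJ is
// like NEAR but the terms must appear in the order given:
//
//   Xapian::QueryParser qp;
//   Xapian::Query q = qp.parse_query("free ADJ software");
//   // "free" must occur before "software", within the positional window.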

// Select lemon syntax highlighting in vim editor: vim: syntax=lemon