xapian-core/queryparser/queryparser.lemony

   1 %include {
   2 /** @file
   3  * @brief build a Xapian::Query object from a user query string
   4  */
   5 /* Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016,2018,2019 Olly Betts
   6  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
   7  * Copyright (C) 2010 Adam Sjøgren
   8  *
   9  * This program is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU General Public License as
  11  * published by the Free Software Foundation; either version 2 of the
  12  * License, or (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  22  * USA
  23  */
  24
  25 #include <config.h>
  26
  27 #include "queryparser_internal.h"
  28
  29 #include "api/queryinternal.h"
  30 #include "omassert.h"
  31 #include "str.h"
  32 #include "stringutils.h"
  33 #include "xapian/error.h"
  34 #include "xapian/unicode.h"
  35
  36 // Include the list of token values lemon generates.
  37 #include "queryparser_token.h"
  38
  39 #include "cjk-tokenizer.h"
  40
  41 #include <algorithm>
  42 #include <cstring>
  43 #include <limits>
  44 #include <list>
  45 #include <string>
  46 #include <vector>
  47
  48 // We create the yyParser on the stack.
  49 #define Parse_ENGINEALWAYSONSTACK
  50
  51 using namespace std;
  52
  53 using namespace Xapian;
  54
  55 inline bool
  56 U_isupper(unsigned ch) {
  57     return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
  58 }
  59
  60 inline bool
  61 U_isdigit(unsigned ch) {
  62     return ch < 128 && C_isdigit(static_cast<unsigned char>(ch));
  63 }
  64
  65 inline bool
  66 U_isalpha(unsigned ch) {
  67     return ch < 128 && C_isalpha(static_cast<unsigned char>(ch));
  68 }
  69
  70 using Xapian::Unicode::is_whitespace;
  71
  72 inline bool
  73 is_not_whitespace(unsigned ch) {
  74     return !is_whitespace(ch);
  75 }
  76
  77 using Xapian::Unicode::is_wordchar;
  78
  79 inline bool
  80 is_not_wordchar(unsigned ch) {
  81     return !is_wordchar(ch);
  82 }
  83
  84 inline bool
  85 is_digit(unsigned ch) {
  86     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
  87 }
  88
  89 // FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
  90 // and there's the risk of hyphens getting stuck onto the end of terms...
  91 inline bool
  92 is_suffix(unsigned ch) {
  93     return ch == '+' || ch == '#';
  94 }
  95
  96 inline bool
  97 is_double_quote(unsigned ch) {
  98     // We simply treat all double quotes as equivalent, which is a bit crude,
  99     // but it isn't clear that it would actually better to require them to
 100     // match up exactly.
 101     //
 102     // 0x201c is Unicode opening double quote.
 103     // 0x201d is Unicode closing double quote.
 104     return ch == '"' || ch == 0x201c || ch == 0x201d;
 105 }
 106
 107 inline bool
 108 prefix_needs_colon(const string & prefix, unsigned ch)
 109 {
 110     if (!U_isupper(ch) && ch != ':') return false;
 111     string::size_type len = prefix.length();
 112     return (len > 1 && prefix[len - 1] != ':');
 113 }
 114
 115 using Unicode::is_currency;
 116
 117 inline bool
 118 is_positional(Xapian::Query::op op)
 119 {
 120     return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
 121 }
 122
 123 class Terms;
 124
 125 /** Class used to pass information about a token from lexer to parser.
 126  *
 127  *  Generally an instance of this class carries term information, but it can be
 128  *  used for a range query, and with some operators (e.g. the distance in
 129  *  NEAR/3 or ADJ/3, etc).
 130  */
 131 class Term {
 132     State * state;
 133
 134   public:
 135     string name;
 136     const FieldInfo * field_info;
 137     string unstemmed;
 138     QueryParser::stem_strategy stem;
 139     termpos pos;
 140     Query query;
 141
 142     Term(const string &name_, termpos pos_)
 143         : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
 144     explicit Term(const string &name_)
 145         : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
 146     Term(const string &name_, const FieldInfo * field_info_)
 147         : name(name_), field_info(field_info_),
 148           stem(QueryParser::STEM_NONE), pos(0) { }
 149     explicit Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
 150     Term(State * state_, const string &name_, const FieldInfo * field_info_,
 151          const string &unstemmed_,
 152          QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
 153          termpos pos_ = 0)
 154         : state(state_), name(name_), field_info(field_info_),
 155           unstemmed(unstemmed_), stem(stem_), pos(pos_) { }
 156     // For RANGE tokens.
 157     Term(const Xapian::Query & q, const string & grouping)
 158         : name(grouping), query(q) { }
 159
 160     string make_term(const string & prefix) const;
 161
 162     void need_positions() {
 163         if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
 164     }
 165
 166     termpos get_termpos() const { return pos; }
 167
 168     string get_grouping() const {
 169         return field_info->grouping;
 170     }
 171
 172     Query * as_wildcarded_query(State * state) const;
 173
 174     /** Build a query for a term at the very end of the query string when
 175      *  FLAG_PARTIAL is in use.
 176      *
 177      *  This query should match documents containing any terms which start with
 178      *  the characters specified, but should give a higher score to exact
 179      *  matches (since the user might have finished typing - we simply don't
 180      *  know).
 181      */
 182     Query * as_partial_query(State * state_) const;
 183
 184     /** Build a query for a string of CJK characters. */
 185     Query * as_cjk_query() const;
 186
 187     /** Handle a CJK character string in a positional context. */
 188     void as_positional_cjk_term(Terms * terms) const;
 189
 190     /// Range query.
 191     Query as_range_query() const;
 192
 193     Query get_query() const;
 194
 195     Query get_query_with_synonyms() const;
 196
 197     Query get_query_with_auto_synonyms() const;
 198 };
 199
 200 /// Parser State shared between the lexer and the parser.
 201 class State {
 202     QueryParser::Internal * qpi;
 203
 204   public:
 205     Query query;
 206     const char* error = NULL;
 207     unsigned flags;
 208     Query::op effective_default_op;
 209
 210     State(QueryParser::Internal * qpi_, unsigned flags_)
 211         : qpi(qpi_), flags(flags_), effective_default_op(qpi_->default_op)
 212     {
 213         if ((flags & QueryParser::FLAG_NO_POSITIONS)) {
 214             if (is_positional(effective_default_op)) {
 215                 effective_default_op = Query::OP_AND;
 216                 }
 217         }
 218     }
 219
 220     string stem_term(const string &term) {
 221         return qpi->stemmer(term);
 222     }
 223
 224     void add_to_stoplist(const Term * term) {
 225         qpi->stoplist.push_back(term->name);
 226     }
 227
 228     void add_to_unstem(const string & term, const string & unstemmed) {
 229         qpi->unstem.insert(make_pair(term, unstemmed));
 230     }
 231
 232     Term * range(const string &a, const string &b) {
 233         for (auto i : qpi->rangeprocs) {
 234             Xapian::Query range_query = (i.proc)->check_range(a, b);
 235             Xapian::Query::op op = range_query.get_type();
 236             switch (op) {
 237                 case Xapian::Query::OP_INVALID:
 238                     break;
 239                 case Xapian::Query::OP_VALUE_RANGE:
 240                 case Xapian::Query::OP_VALUE_GE:
 241                 case Xapian::Query::OP_VALUE_LE:
 242                     if (i.default_grouping) {
 243                         Xapian::Internal::QueryValueBase * base =
 244                             static_cast<Xapian::Internal::QueryValueBase*>(
 245                                 range_query.internal.get());
 246                         Xapian::valueno slot = base->get_slot();
 247                         return new Term(range_query, str(slot));
 248                     }
 249                     // FALLTHRU
 250                 case Xapian::Query::LEAF_TERM:
 251                     return new Term(range_query, i.grouping);
 252                 default:
 253                     return new Term(range_query, string());
 254             }
 255         }
 256         return NULL;
 257     }
 258
 259     Query::op default_op() const {
 260         return effective_default_op;
 261     }
 262
 263     bool is_stopword(const Term *term) const {
 264         return qpi->stopper.get() && (*qpi->stopper)(term->name);
 265     }
 266
 267     Database get_database() const {
 268         return qpi->db;
 269     }
 270
 271     const Stopper * get_stopper() const {
 272         return qpi->stopper.get();
 273     }
 274
 275     size_t stoplist_size() const {
 276         return qpi->stoplist.size();
 277     }
 278
 279     void stoplist_resize(size_t s) {
 280         qpi->stoplist.resize(s);
 281     }
 282
 283     Xapian::termcount get_max_wildcard_expansion() const {
 284         return qpi->max_wildcard_expansion;
 285     }
 286
 287     int get_max_wildcard_type() const {
 288         return qpi->max_wildcard_type;
 289     }
 290
 291     Xapian::termcount get_max_partial_expansion() const {
 292         return qpi->max_partial_expansion;
 293     }
 294
 295     int get_max_partial_type() const {
 296         return qpi->max_partial_type;
 297     }
 298 };
 299
 300 string
 301 Term::make_term(const string & prefix) const
 302 {
 303     string term;
 304     if (stem != QueryParser::STEM_NONE && stem != QueryParser::STEM_ALL)
 305         term += 'Z';
 306     if (!prefix.empty()) {
 307         term += prefix;
 308         if (prefix_needs_colon(prefix, name[0])) term += ':';
 309     }
 310     if (stem != QueryParser::STEM_NONE) {
 311         term += state->stem_term(name);
 312     } else {
 313         term += name;
 314     }
 315
 316     if (!unstemmed.empty())
 317         state->add_to_unstem(term, unstemmed);
 318     return term;
 319 }
 320
 321 // Iterator shim to allow building a synonym query from a TermIterator pair.
 322 class SynonymIterator {
 323     Xapian::TermIterator i;
 324
 325     Xapian::termpos pos;
 326
 327     const Xapian::Query * first;
 328
 329   public:
 330     SynonymIterator(const Xapian::TermIterator & i_,
 331                     Xapian::termpos pos_ = 0,
 332                     const Xapian::Query * first_ = NULL)
 333         : i(i_), pos(pos_), first(first_) { }
 334
 335     SynonymIterator & operator++() {
 336         if (first)
 337             first = NULL;
 338         else
 339             ++i;
 340         return *this;
 341     }
 342
 343     const Xapian::Query operator*() const {
 344         if (first) return *first;
 345         return Xapian::Query(*i, 1, pos);
 346     }
 347
 348     bool operator==(const SynonymIterator & o) const {
 349         return i == o.i && first == o.first;
 350     }
 351
 352     bool operator!=(const SynonymIterator & o) const {
 353         return !(*this == o);
 354     }
 355
 356     typedef std::input_iterator_tag iterator_category;
 357     typedef Xapian::Query value_type;
 358     typedef Xapian::termcount_diff difference_type;
 359     typedef Xapian::Query * pointer;
 360     typedef Xapian::Query & reference;
 361 };
 362
 363 Query
 364 Term::get_query_with_synonyms() const
 365 {
 366     // Handle single-word synonyms with each prefix.
 367     const auto& prefixes = field_info->prefixes;
 368     if (prefixes.empty()) {
 369         Assert(field_info->proc.get());
 370         return (*field_info->proc)(name);
 371     }
 372
 373     Query q = get_query();
 374
 375     for (auto&& prefix : prefixes) {
 376         // First try the unstemmed term:
 377         string term;
 378         if (!prefix.empty()) {
 379             term += prefix;
 380             if (prefix_needs_colon(prefix, name[0])) term += ':';
 381         }
 382         term += name;
 383
 384         Xapian::Database db = state->get_database();
 385         Xapian::TermIterator syn = db.synonyms_begin(term);
 386         Xapian::TermIterator end = db.synonyms_end(term);
 387         if (syn == end && stem != QueryParser::STEM_NONE) {
 388             // If that has no synonyms, try the stemmed form:
 389             term = 'Z';
 390             if (!prefix.empty()) {
 391                 term += prefix;
 392                 if (prefix_needs_colon(prefix, name[0])) term += ':';
 393             }
 394             term += state->stem_term(name);
 395             syn = db.synonyms_begin(term);
 396             end = db.synonyms_end(term);
 397         }
 398         q = Query(q.OP_SYNONYM,
 399                   SynonymIterator(syn, pos, &q),
 400                   SynonymIterator(end));
 401     }
 402     return q;
 403 }
 404
 405 Query
 406 Term::get_query_with_auto_synonyms() const
 407 {
 408     const unsigned MASK_ENABLE_AUTO_SYNONYMS =
 409         QueryParser::FLAG_AUTO_SYNONYMS |
 410         QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 411     if (state->flags & MASK_ENABLE_AUTO_SYNONYMS)
 412         return get_query_with_synonyms();
 413
 414     return get_query();
 415 }
 416
 417 static void
 418 add_to_query(Query *& q, Query::op op, Query * term)
 419 {
 420     Assert(term);
 421     if (q) {
 422         if (op == Query::OP_OR) {
 423             *q |= *term;
 424         } else if (op == Query::OP_AND) {
 425             *q &= *term;
 426         } else {
 427             *q = Query(op, *q, *term);
 428         }
 429         delete term;
 430     } else {
 431         q = term;
 432     }
 433 }
 434
 435 static void
 436 add_to_query(Query *& q, Query::op op, const Query & term)
 437 {
 438     if (q) {
 439         if (op == Query::OP_OR) {
 440             *q |= term;
 441         } else if (op == Query::OP_AND) {
 442             *q &= term;
 443         } else {
 444             *q = Query(op, *q, term);
 445         }
 446     } else {
 447         q = new Query(term);
 448     }
 449 }
 450
 451 Query
 452 Term::get_query() const
 453 {
 454     const auto& prefixes = field_info->prefixes;
 455     if (prefixes.empty()) {
 456         Assert(field_info->proc.get());
 457         return (*field_info->proc)(name);
 458     }
 459     auto piter = prefixes.begin();
 460     Query q(make_term(*piter), 1, pos);
 461     while (++piter != prefixes.end()) {
 462         q |= Query(make_term(*piter), 1, pos);
 463     }
 464     return q;
 465 }
 466
 467 Query *
 468 Term::as_wildcarded_query(State * state_) const
 469 {
 470     const auto& prefixes = field_info->prefixes;
 471     Xapian::termcount max = state_->get_max_wildcard_expansion();
 472     int max_type = state_->get_max_wildcard_type();
 473     vector<Query> subqs;
 474     subqs.reserve(prefixes.size());
 475     for (string root : prefixes) {
 476         root += name;
 477         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 478         subqs.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
 479                               Query::OP_OR));
 480     }
 481     Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
 482     delete this;
 483     return q;
 484 }
 485
 486 Query *
 487 Term::as_partial_query(State * state_) const
 488 {
 489     Xapian::termcount max = state_->get_max_partial_expansion();
 490     int max_type = state_->get_max_partial_type();
 491     vector<Query> subqs_partial; // A synonym of all the partial terms.
 492     vector<Query> subqs_full; // A synonym of all the full terms.
 493
 494     for (const string& prefix : field_info->prefixes) {
 495         string root = prefix;
 496         root += name;
 497         // Combine with OP_OR, and apply OP_SYNONYM afterwards.
 498         subqs_partial.push_back(Query(Query::OP_WILDCARD, root, max, max_type,
 499                                       Query::OP_OR));
 500         // Add the term, as it would normally be handled, as an alternative.
 501         subqs_full.push_back(Query(make_term(prefix), 1, pos));
 502     }
 503     Query * q = new Query(Query::OP_OR,
 504                           Query(Query::OP_SYNONYM,
 505                                 subqs_partial.begin(), subqs_partial.end()),
 506                           Query(Query::OP_SYNONYM,
 507                                 subqs_full.begin(), subqs_full.end()));
 508     delete this;
 509     return q;
 510 }
 511
 512 Query *
 513 Term::as_cjk_query() const
 514 {
 515     vector<Query> prefix_subqs;
 516     vector<Query> cjk_subqs;
 517     const auto& prefixes = field_info->prefixes;
 518     for (const string& prefix : prefixes) {
 519         for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
 520             cjk_subqs.push_back(Query(prefix + *tk, 1, pos));
 521         }
 522         prefix_subqs.push_back(Query(Query::OP_AND,
 523                                      cjk_subqs.begin(), cjk_subqs.end()));
 524         cjk_subqs.clear();
 525     }
 526     Query * q = new Query(Query::OP_OR,
 527                           prefix_subqs.begin(), prefix_subqs.end());
 528     delete this;
 529     return q;
 530 }
 531
 532 Query
 533 Term::as_range_query() const
 534 {
 535     Query q = query;
 536     delete this;
 537     return q;
 538 }
 539
 540 inline bool
 541 is_phrase_generator(unsigned ch)
 542 {
 543     // These characters generate a phrase search.
 544     // Ordered mostly by frequency of calls to this function done when
 545     // running the testcases in api_queryparser.cc.
 546     return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
 547 }
 548
 549 inline bool
 550 is_stem_preventer(unsigned ch)
 551 {
 552     return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
 553 }
 554
 555 inline bool
 556 should_stem(const string & term)
 557 {
 558     const unsigned int SHOULD_STEM_MASK =
 559         (1 << Unicode::LOWERCASE_LETTER) |
 560         (1 << Unicode::TITLECASE_LETTER) |
 561         (1 << Unicode::MODIFIER_LETTER) |
 562         (1 << Unicode::OTHER_LETTER);
 563     Utf8Iterator u(term);
 564     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
 565 }
 566
 567 /** Value representing "ignore this" when returned by check_infix() or
 568  *  check_infix_digit().
 569  */
 570 const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
 571
 572 inline unsigned check_infix(unsigned ch) {
 573     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
 574         // Unicode includes all these except '&' in its word boundary rules,
 575         // as well as 0x2019 (which we handle below) and ':' (for Swedish
 576         // apparently, but we ignore this for now as it's problematic in
 577         // real world cases).
 578         return ch;
 579     }
 580     if (ch >= 0x200b) {
 581         // 0x2019 is Unicode apostrophe and single closing quote.
 582         // 0x201b is Unicode single opening quote with the tail rising.
 583         if (ch == 0x2019 || ch == 0x201b)
 584             return '\'';
 585         if (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff)
 586             return UNICODE_IGNORE;
 587     }
 588     return 0;
 589 }
 590
 591 inline unsigned check_infix_digit(unsigned ch) {
 592     // This list of characters comes from Unicode's word identifying algorithm.
 593     switch (ch) {
 594         case ',':
 595         case '.':
 596         case ';':
 597         case 0x037e: // GREEK QUESTION MARK
 598         case 0x0589: // ARMENIAN FULL STOP
 599         case 0x060D: // ARABIC DATE SEPARATOR
 600         case 0x07F8: // NKO COMMA
 601         case 0x2044: // FRACTION SLASH
 602         case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
 603         case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
 604         case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
 605             return ch;
 606     }
 607     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
 608         return UNICODE_IGNORE;
 609     return 0;
 610 }
 611
 612 // Prototype a function lemon generates, but which we want to call before that
 613 // in the generated source code file.
 614 struct yyParser;
 615 static void yy_parse_failed(yyParser *);
 616
 617 void
 618 QueryParser::Internal::add_prefix(const string &field, const string &prefix)
 619 {
 620     map<string, FieldInfo>::iterator p = field_map.find(field);
 621     if (p == field_map.end()) {
 622         field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, prefix)));
 623     } else {
 624         // Check that this is the same type of filter as the existing one(s).
 625         if (p->second.type != NON_BOOLEAN) {
 626             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
 627         }
 628         if (p->second.proc.get())
 629             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 630         p->second.prefixes.push_back(prefix);
 631    }
 632 }
 633
 634 void
 635 QueryParser::Internal::add_prefix(const string &field, FieldProcessor *proc)
 636 {
 637     map<string, FieldInfo>::iterator p = field_map.find(field);
 638     if (p == field_map.end()) {
 639         field_map.insert(make_pair(field, FieldInfo(NON_BOOLEAN, proc)));
 640     } else {
 641         // Check that this is the same type of filter as the existing one(s).
 642         if (p->second.type != NON_BOOLEAN) {
 643             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
 644         }
 645         if (!p->second.prefixes.empty())
 646             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 647         throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
 648    }
 649 }
 650
 651 void
 652 QueryParser::Internal::add_boolean_prefix(const string &field,
 653                                           const string &prefix,
 654                                           const string* grouping)
 655 {
 656     // Don't allow the empty prefix to be set as boolean as it doesn't
 657     // really make sense.
 658     if (field.empty())
 659         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 660     if (!grouping) grouping = &field;
 661     filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 662     map<string, FieldInfo>::iterator p = field_map.find(field);
 663     if (p == field_map.end()) {
 664         field_map.insert(make_pair(field, FieldInfo(type, prefix, *grouping)));
 665     } else {
 666         // Check that this is the same type of filter as the existing one(s).
 667         if (p->second.type != type) {
 668             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
 669         }
 670         if (p->second.proc.get())
 671             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 672         p->second.prefixes.push_back(prefix); // FIXME grouping
 673    }
 674 }
 675
 676 void
 677 QueryParser::Internal::add_boolean_prefix(const string &field,
 678                                           FieldProcessor *proc,
 679                                           const string* grouping)
 680 {
 681     // Don't allow the empty prefix to be set as boolean as it doesn't
 682     // really make sense.
 683     if (field.empty())
 684         throw Xapian::UnimplementedError("Can't set the empty prefix to be a boolean filter");
 685     if (!grouping) grouping = &field;
 686     filter_type type = grouping->empty() ? BOOLEAN : BOOLEAN_EXCLUSIVE;
 687     map<string, FieldInfo>::iterator p = field_map.find(field);
 688     if (p == field_map.end()) {
 689         field_map.insert(make_pair(field, FieldInfo(type, proc, *grouping)));
 690     } else {
 691         // Check that this is the same type of filter as the existing one(s).
 692         if (p->second.type != type) {
 693             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter"); // FIXME
 694         }
 695         if (!p->second.prefixes.empty())
 696             throw Xapian::FeatureUnavailableError("Mixing FieldProcessor objects and string prefixes currently not supported");
 697         throw Xapian::FeatureUnavailableError("Multiple FieldProcessor objects for the same prefix currently not supported");
 698    }
 699 }
 700
 701 string
 702 QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
 703                                   bool cjk_ngram, bool & is_cjk_term,
 704                                   bool &was_acronym)
 705 {
 706     string term;
 707     // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
 708     // Don't worry if there's a trailing '.' or not.
 709     if (U_isupper(*it)) {
 710         string t;
 711         Utf8Iterator p = it;
 712         do {
 713             Unicode::append_utf8(t, *p++);
 714         } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
 715         // One letter does not make an acronym!  If we handled a single
 716         // uppercase letter here, we wouldn't catch M&S below.
 717         if (t.length() > 1) {
 718             // Check there's not a (lower case) letter or digit
 719             // immediately after it.
 720             // FIXME: should I.B.M..P.T.O be a range search?
 721             if (p == end || !is_wordchar(*p)) {
 722                 it = p;
 723                 swap(term, t);
 724             }
 725         }
 726     }
 727     was_acronym = !term.empty();
 728
 729     if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
 730         const char* cjk = it.raw();
 731         CJK::get_cjk(it);
 732         term.assign(cjk, it.raw() - cjk);
 733         is_cjk_term = true;
 734     }
 735
 736     if (term.empty()) {
 737         unsigned prevch = *it;
 738         Unicode::append_utf8(term, prevch);
 739         while (++it != end) {
 740             if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
 741             unsigned ch = *it;
 742             if (!is_wordchar(ch)) {
 743                 // Treat a single embedded '&' or "'" or similar as a word
 744                 // character (e.g. AT&T, Fred's).  Also, normalise
 745                 // apostrophes to ASCII apostrophe.
 746                 Utf8Iterator p = it;
 747                 ++p;
 748                 if (p == end || !is_wordchar(*p)) break;
 749                 unsigned nextch = *p;
 750                 if (is_digit(prevch) && is_digit(nextch)) {
 751                     ch = check_infix_digit(ch);
 752                 } else {
 753                     ch = check_infix(ch);
 754                 }
 755                 if (!ch) break;
 756                 if (ch == UNICODE_IGNORE)
 757                     continue;
 758             }
 759             Unicode::append_utf8(term, ch);
 760             prevch = ch;
 761         }
 762         if (it != end && is_suffix(*it)) {
 763             string suff_term = term;
 764             Utf8Iterator p = it;
 765             // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
 766             do {
 767                 if (suff_term.size() - term.size() == 3) {
 768                     suff_term.resize(0);
 769                     break;
 770                 }
 771                 suff_term += *p;
 772             } while (is_suffix(*++p));
 773             if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
 774                 // If the suffixed term doesn't exist, check that the
 775                 // non-suffixed term does.  This also takes care of
 776                 // the case when QueryParser::set_database() hasn't
 777                 // been called.
 778                 bool use_suff_term = false;
 779                 string lc = Unicode::tolower(suff_term);
 780                 if (db.term_exists(lc)) {
 781                     use_suff_term = true;
 782                 } else {
 783                     lc = Unicode::tolower(term);
 784                     if (!db.term_exists(lc)) use_suff_term = true;
 785                 }
 786                 if (use_suff_term) {
 787                     term = suff_term;
 788                     it = p;
 789                 }
 790             }
 791         }
 792     }
 793     return term;
 794 }
 795
 796 }
 797 // Switch to %code to insert at the end of the file so struct yyParser has been
 798 // defined.
 799 %code {
 800
 801 Query
 802 QueryParser::Internal::parse_query(const string &qs, unsigned flags,
 803                                    const string &default_prefix)
 804 {
 805     bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();
 806
 807     // Set ranges if we may have to handle ranges in the query.
 808     bool ranges = !rangeprocs.empty() && (qs.find("..") != string::npos);
 809
 810     termpos term_pos = 1;
 811     Utf8Iterator it(qs), end;
 812
 813     State state(this, flags);
 814
 815     // To successfully apply more than one spelling correction to a query
 816     // string, we must keep track of the offset due to previous corrections.
 817     int correction_offset = 0;
 818     corrected_query.resize(0);
 819
 820     // Stack of prefixes, used for phrases and subexpressions.
 821     list<const FieldInfo *> prefix_stack;
 822
 823     // If default_prefix is specified, use it.  Otherwise, use any list
 824     // that has been set for the empty prefix.
 825     const FieldInfo def_pfx(NON_BOOLEAN, default_prefix);
 826     {
 827         const FieldInfo * default_field_info = &def_pfx;
 828         if (default_prefix.empty()) {
 829             auto f = field_map.find(string());
 830             if (f != field_map.end()) default_field_info = &(f->second);
 831         }
 832
 833         // We always have the current prefix on the top of the stack.
 834         prefix_stack.push_back(default_field_info);
 835     }
 836
 837     yyParser parser;
 838
 839     unsigned newprev = ' ';
 840 main_lex_loop:
 841     enum {
 842         DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
 843         IN_GROUP2, EXPLICIT_SYNONYM
 844     } mode = DEFAULT;
 845     while (it != end && !state.error) {
 846         bool last_was_operator = false;
 847         bool last_was_operator_needing_term = false;
 848         if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
 849         if (false) {
 850 just_had_operator:
 851             if (it == end) break;
 852             mode = DEFAULT;
 853             last_was_operator_needing_term = false;
 854             last_was_operator = true;
 855         }
 856         if (false) {
 857 just_had_operator_needing_term:
 858             last_was_operator_needing_term = true;
 859             last_was_operator = true;
 860         }
 861         if (mode == IN_PHRASED_TERM) mode = DEFAULT;
 862         if (is_whitespace(*it)) {
 863             newprev = ' ';
 864             ++it;
 865             it = find_if(it, end, is_not_whitespace);
 866             if (it == end) break;
 867         }
 868
 869         if (ranges &&
 870             (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
 871             // Scan forward to see if this could be the "start of range"
 872             // token.  Sadly this has O(n²) tendencies, though at least
 873             // "n" is the number of words in a query which is likely to
 874             // remain fairly small.  FIXME: can we tokenise more elegantly?
 875             Utf8Iterator it_initial = it;
 876             Utf8Iterator p = it;
 877             unsigned ch = 0;
 878             while (p != end) {
 879                 if (ch == '.' && *p == '.') {
 880                     string a;
 881                     while (it != p) {
 882                         Unicode::append_utf8(a, *it++);
 883                     }
 884                     // Trim off the trailing ".".
 885                     a.resize(a.size() - 1);
 886                     ++p;
 887                     // Either end of the range can be empty (for an open-ended
 888                     // range) but both can't be empty.
 889                     if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
 890                         string b;
 891                         // Allow any character except whitespace and ')' in the
 892                         // upper bound.
 893                         while (p != end && *p > ' ' && *p != ')') {
 894                             Unicode::append_utf8(b, *p++);
 895                         }
 896                         Term * range = state.range(a, b);
 897                         if (!range) {
 898                             state.error = "Unknown range operation";
 899                             if (a.find(':', 1) == string::npos) {
 900                                 goto done;
 901                             }
 902                             // Might be a boolean filter with ".." in.  Leave
 903                             // state.error in case it isn't.
 904                             it = it_initial;
 905                             break;
 906                         }
 907                         Parse(&parser, RANGE, range, &state);
 908                     }
 909                     it = p;
 910                     goto main_lex_loop;
 911                 }
 912                 ch = *p;
 913                 // Allow any character except whitespace and '(' in the lower
 914                 // bound.
 915                 if (ch <= ' ' || ch == '(') break;
 916                 ++p;
 917             }
 918         }
 919
 920         if (!is_wordchar(*it)) {
 921             unsigned prev = newprev;
 922             unsigned ch = *it++;
 923             newprev = ch;
 924             // Drop out of IN_GROUP mode.
 925             if (mode == IN_GROUP || mode == IN_GROUP2)
 926                 mode = DEFAULT;
 927             switch (ch) {
 928               case '"':
 929               case 0x201c: // Left curly double quote.
 930               case 0x201d: // Right curly double quote.
 931                 // Quoted phrase.
 932                 if (mode == DEFAULT) {
 933                     // Skip whitespace.
 934                     it = find_if(it, end, is_not_whitespace);
 935                     if (it == end) {
 936                         // Ignore an unmatched " at the end of the query to
 937                         // avoid generating an empty pair of QUOTEs which will
 938                         // cause a parse error.
 939                         goto done;
 940                     }
 941                     if (is_double_quote(*it)) {
 942                         // Ignore empty "" (but only if we're not already
 943                         // IN_QUOTES as we don't merge two adjacent quoted
 944                         // phrases!)
 945                         newprev = *it++;
 946                         break;
 947                     }
 948                 }
 949                 if (flags & QueryParser::FLAG_PHRASE) {
 950                     if (ch == '"' && it != end && *it == '"') {
 951                         ++it;
 952                         // Handle "" inside a quoted phrase as an escaped " for
 953                         // consistency with quoted boolean terms.
 954                         break;
 955                     }
 956                     Parse(&parser, QUOTE, NULL, &state);
 957                     if (mode == DEFAULT) {
 958                         mode = IN_QUOTES;
 959                     } else {
 960                         // Remove the prefix we pushed for this phrase.
 961                         if (mode == IN_PREFIXED_QUOTES)
 962                             prefix_stack.pop_back();
 963                         mode = DEFAULT;
 964                     }
 965                 }
 966                 break;
 967
 968               case '+': case '-': // Loved or hated term/phrase/subexpression.
 969                 // Ignore + or - at the end of the query string.
 970                 if (it == end) goto done;
 971                 if (prev > ' ' && prev != '(') {
 972                     // Or if not after whitespace or an open bracket.
 973                     break;
 974                 }
 975                 if (is_whitespace(*it) || *it == '+' || *it == '-') {
 976                     // Ignore + or - followed by a space, or further + or -.
 977                     // Postfix + (such as in C++ and H+) is handled as part of
 978                     // the term lexing code in parse_term().
 979                     newprev = *it++;
 980                     break;
 981                 }
 982                 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
 983                     int token;
 984                     if (ch == '+') {
 985                         token = LOVE;
 986                     } else if (last_was_operator) {
 987                         token = HATE_AFTER_AND;
 988                     } else {
 989                         token = HATE;
 990                     }
 991                     Parse(&parser, token, NULL, &state);
 992                     goto just_had_operator_needing_term;
 993                 }
 994                 // Need to prevent the term after a LOVE or HATE starting a
 995                 // term group...
 996                 break;
 997
 998               case '(': // Bracketed subexpression.
 999                 // Skip whitespace.
1000                 it = find_if(it, end, is_not_whitespace);
1001                 // Ignore ( at the end of the query string.
1002                 if (it == end) goto done;
1003                 if (prev > ' ' && strchr("()+-", prev) == NULL) {
1004                     // Or if not after whitespace or a bracket or '+' or '-'.
1005                     break;
1006                 }
1007                 if (*it == ')') {
1008                     // Ignore empty ().
1009                     newprev = *it++;
1010                     break;
1011                 }
1012                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1013                     prefix_stack.push_back(prefix_stack.back());
1014                     Parse(&parser, BRA, NULL, &state);
1015                 }
1016                 break;
1017
1018               case ')': // End of bracketed subexpression.
1019                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
1020                     // Remove the prefix we pushed for the corresponding BRA.
1021                     // If brackets are unmatched, it's a syntax error, but
1022                     // that's no excuse to SEGV!
1023                     if (prefix_stack.size() > 1) prefix_stack.pop_back();
1024                     Parse(&parser, KET, NULL, &state);
1025                 }
1026                 break;
1027
1028               case '~': // Synonym expansion.
1029                 // Ignore at the end of the query string.
1030                 if (it == end) goto done;
1031                 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
1032                     if (prev > ' ' && strchr("+-(", prev) == NULL) {
1033                         // Or if not after whitespace, +, -, or an open bracket.
1034                         break;
1035                     }
1036                     if (!is_wordchar(*it)) {
1037                         // Ignore if not followed by a word character.
1038                         break;
1039                     }
1040                     Parse(&parser, SYNONYM, NULL, &state);
1041                     mode = EXPLICIT_SYNONYM;
1042                     goto just_had_operator_needing_term;
1043                 }
1044                 break;
1045             }
1046             // Skip any other characters.
1047             continue;
1048         }
1049
1050         Assert(is_wordchar(*it));
1051
1052         size_t term_start_index = it.raw() - qs.data();
1053
1054         newprev = 'A'; // Any letter will do...
1055
1056         // A term, a prefix, or a boolean operator.
1057         const FieldInfo * field_info = NULL;
1058         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
1059             !field_map.empty()) {
1060             // Check for a fieldname prefix (e.g. title:historical).
1061             Utf8Iterator p = find_if(it, end, is_not_wordchar);
1062             if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
1063                 string field;
1064                 p = it;
1065                 while (*p != ':')
1066                     Unicode::append_utf8(field, *p++);
1067                 map<string, FieldInfo>::const_iterator f;
1068                 f = field_map.find(field);
1069                 if (f != field_map.end()) {
1070                     // Special handling for prefixed fields, depending on the
1071                     // type of the prefix.
1072                     unsigned ch = *++p;
1073                     field_info = &(f->second);
1074
1075                     if (field_info->type != NON_BOOLEAN) {
1076                         // Drop out of IN_GROUP if we're in it.
1077                         if (mode == IN_GROUP || mode == IN_GROUP2)
1078                             mode = DEFAULT;
1079                         it = p;
1080                         string name;
1081                         if (it != end && is_double_quote(*it)) {
1082                             // Quoted boolean term (can contain any character).
1083                             bool fancy = (*it != '"');
1084                             ++it;
1085                             while (it != end) {
1086                                 if (*it == '"') {
1087                                     // Interpret "" as an escaped ".
1088                                     if (++it == end || *it != '"')
1089                                         break;
1090                                 } else if (fancy && is_double_quote(*it)) {
1091                                     // If the opening quote was ASCII, then the
1092                                     // closing one must be too - otherwise
1093                                     // the user can't protect non-ASCII double
1094                                     // quote characters by quoting or escaping.
1095                                     ++it;
1096                                     break;
1097                                 }
1098                                 Unicode::append_utf8(name, *it++);
1099                             }
1100                         } else {
1101                             // Can't boolean filter prefix a subexpression, so
1102                             // just use anything following the prefix until the
1103                             // next space or ')' as part of the boolean filter
1104                             // term.
1105                             while (it != end && *it > ' ' && *it != ')')
1106                                 Unicode::append_utf8(name, *it++);
1107                         }
1108                         // Build the unstemmed form in field.
1109                         field += ':';
1110                         field += name;
1111                         // Clear any pending range error.
1112                         state.error = NULL;
1113                         Term * token = new Term(&state, name, field_info, field);
1114                         Parse(&parser, BOOLEAN_FILTER, token, &state);
1115                         continue;
1116                     }
1117
1118                     if ((flags & FLAG_PHRASE) && is_double_quote(ch)) {
1119                         // Prefixed phrase, e.g.: subject:"space flight"
1120                         mode = IN_PREFIXED_QUOTES;
1121                         Parse(&parser, QUOTE, NULL, &state);
1122                         it = p;
1123                         newprev = ch;
1124                         ++it;
1125                         prefix_stack.push_back(field_info);
1126                         continue;
1127                     }
1128
1129                     if (ch == '(' && (flags & FLAG_BOOLEAN)) {
1130                         // Prefixed subexpression, e.g.: title:(fast NEAR food)
1131                         mode = DEFAULT;
1132                         Parse(&parser, BRA, NULL, &state);
1133                         it = p;
1134                         newprev = ch;
1135                         ++it;
1136                         prefix_stack.push_back(field_info);
1137                         continue;
1138                     }
1139
1140                     if (ch != ':') {
1141                         // Allow 'path:/usr/local' but not 'foo::bar::baz'.
1142                         while (is_phrase_generator(ch)) {
1143                             if (++p == end)
1144                                 goto not_prefix;
1145                             ch = *p;
1146                         }
1147                     }
1148
1149                     if (is_wordchar(ch)) {
1150                         // Prefixed term.
1151                         it = p;
1152                     } else {
1153 not_prefix:
1154                         // It looks like a prefix but isn't, so parse it as
1155                         // text instead.
1156                         field_info = NULL;
1157                     }
1158                 }
1159             }
1160         }
1161
1162 phrased_term:
1163         bool was_acronym;
1164         bool is_cjk_term = false;
1165         string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
1166
1167         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
1168             (flags & FLAG_BOOLEAN) &&
1169             // Don't want to interpret A.N.D. as an AND operator.
1170             !was_acronym &&
1171             !field_info &&
1172             term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
1173             // Boolean operators.
1174             string op = term;
1175             if (flags & FLAG_BOOLEAN_ANY_CASE) {
1176                 for (string::iterator i = op.begin(); i != op.end(); ++i) {
1177                     *i = C_toupper(*i);
1178                 }
1179             }
1180             if (op.size() == 3) {
1181                 if (op == "AND") {
1182                     Parse(&parser, AND, NULL, &state);
1183                     goto just_had_operator;
1184                 }
1185                 if (op == "NOT") {
1186                     Parse(&parser, NOT, NULL, &state);
1187                     goto just_had_operator;
1188                 }
1189                 if (op == "XOR") {
1190                     Parse(&parser, XOR, NULL, &state);
1191                     goto just_had_operator;
1192                 }
1193                 if (op == "ADJ") {
1194                     if (it != end && *it == '/') {
1195                         size_t width = 0;
1196                         Utf8Iterator p = it;
1197                         while (++p != end && U_isdigit(*p)) {
1198                             width = (width * 10) + (*p - '0');
1199                         }
1200                         if (width && (p == end || is_whitespace(*p))) {
1201                             it = p;
1202                             Parse(&parser, ADJ, new Term(width), &state);
1203                             goto just_had_operator;
1204                         }
1205                     } else {
1206                         Parse(&parser, ADJ, NULL, &state);
1207                         goto just_had_operator;
1208                     }
1209                 }
1210             } else if (op.size() == 2) {
1211                 if (op == "OR") {
1212                     Parse(&parser, OR, NULL, &state);
1213                     goto just_had_operator;
1214                 }
1215             } else if (op.size() == 4) {
1216                 if (op == "NEAR") {
1217                     if (it != end && *it == '/') {
1218                         size_t width = 0;
1219                         Utf8Iterator p = it;
1220                         while (++p != end && U_isdigit(*p)) {
1221                             width = (width * 10) + (*p - '0');
1222                         }
1223                         if (width && (p == end || is_whitespace(*p))) {
1224                             it = p;
1225                             Parse(&parser, NEAR, new Term(width), &state);
1226                             goto just_had_operator;
1227                         }
1228                     } else {
1229                         Parse(&parser, NEAR, NULL, &state);
1230                         goto just_had_operator;
1231                     }
1232                 }
1233             }
1234         }
1235
1236         // If no prefix is set, use the default one.
1237         if (!field_info) field_info = prefix_stack.back();
1238
1239         Assert(field_info->type == NON_BOOLEAN);
1240
1241         {
1242             string unstemmed_term(term);
1243             term = Unicode::tolower(term);
1244
1245             // Reuse stem_strategy - STEM_SOME here means "stem terms except
1246             // when used with positional operators".
1247             stem_strategy stem_term = stem_action;
1248             if (stem_term != STEM_NONE) {
1249                 if (stemmer.is_none()) {
1250                     stem_term = STEM_NONE;
1251                 } else if (stem_term == STEM_SOME ||
1252                            stem_term == STEM_SOME_FULL_POS) {
1253                     if (!should_stem(unstemmed_term) ||
1254                         (it != end && is_stem_preventer(*it))) {
1255                         // Don't stem this particular term.
1256                         stem_term = STEM_NONE;
1257                     }
1258                 }
1259             }
1260
1261             Term * term_obj = new Term(&state, term, field_info,
1262                                        unstemmed_term, stem_term, term_pos++);
1263
1264             if (is_cjk_term) {
1265                 Parse(&parser, CJKTERM, term_obj, &state);
1266                 if (it == end) break;
1267                 continue;
1268             }
1269
1270             if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1271                 if (it != end) {
1272                     if ((flags & FLAG_WILDCARD) && *it == '*') {
1273                         Utf8Iterator p(it);
1274                         ++p;
1275                         if (p == end || !is_wordchar(*p)) {
1276                             it = p;
1277                             if (mode == IN_GROUP || mode == IN_GROUP2) {
1278                                 // Drop out of IN_GROUP and flag that the group
1279                                 // can be empty if all members are stopwords.
1280                                 if (mode == IN_GROUP2)
1281                                     Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1282                                 mode = DEFAULT;
1283                             }
1284                             // Wildcard at end of term (also known as
1285                             // "right truncation").
1286                             Parse(&parser, WILD_TERM, term_obj, &state);
1287                             continue;
1288                         }
1289                     }
1290                 } else {
1291                     if (flags & FLAG_PARTIAL) {
1292                         if (mode == IN_GROUP || mode == IN_GROUP2) {
1293                             // Drop out of IN_GROUP and flag that the group
1294                             // can be empty if all members are stopwords.
1295                             if (mode == IN_GROUP2)
1296                                 Parse(&parser, EMPTY_GROUP_OK, NULL, &state);
1297                             mode = DEFAULT;
1298                         }
1299                         // Final term of a partial match query, with no
1300                         // following characters - treat as a wildcard.
1301                         Parse(&parser, PARTIAL_TERM, term_obj, &state);
1302                         continue;
1303                     }
1304                 }
1305             }
1306
1307             // Check spelling, if we're a normal term, and any of the prefixes
1308             // are empty.
1309             if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
1310                 const auto& prefixes = field_info->prefixes;
1311                 for (const string& prefix : prefixes) {
1312                     if (!prefix.empty())
1313                         continue;
1314                     const string & suggest = db.get_spelling_suggestion(term);
1315                     if (!suggest.empty()) {
1316                         if (corrected_query.empty()) corrected_query = qs;
1317                         size_t term_end_index = it.raw() - qs.data();
1318                         size_t n = term_end_index - term_start_index;
1319                         size_t pos = term_start_index + correction_offset;
1320                         corrected_query.replace(pos, n, suggest);
1321                         correction_offset += suggest.size();
1322                         correction_offset -= n;
1323                     }
1324                     break;
1325                 }
1326             }
1327
1328             if (mode == IN_PHRASED_TERM) {
1329                 Parse(&parser, PHR_TERM, term_obj, &state);
1330             } else {
1331                 // See if the next token will be PHR_TERM - if so, this one
1332                 // needs to be TERM not GROUP_TERM.
1333                 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
1334                     is_phrase_generator(*it)) {
1335                     // FIXME: can we clean this up?
1336                     Utf8Iterator p = it;
1337                     do {
1338                         ++p;
1339                     } while (p != end && is_phrase_generator(*p));
1340                     // Don't generate a phrase unless the phrase generators are
1341                     // immediately followed by another term.
1342                     if (p != end && is_wordchar(*p)) {
1343                         mode = DEFAULT;
1344                     }
1345                 }
1346
1347                 int token = TERM;
1348                 if (mode == IN_GROUP || mode == IN_GROUP2) {
1349                     mode = IN_GROUP2;
1350                     token = GROUP_TERM;
1351                 }
1352                 Parse(&parser, token, term_obj, &state);
1353                 if (token == TERM && mode != DEFAULT)
1354                     continue;
1355             }
1356         }
1357
1358         if (it == end) break;
1359
1360         if (is_phrase_generator(*it)) {
1361             // Skip multiple phrase generators.
1362             do {
1363                 ++it;
1364             } while (it != end && is_phrase_generator(*it));
1365             // Don't generate a phrase unless the phrase generators are
1366             // immediately followed by another term.
1367             if (it != end && is_wordchar(*it)) {
1368                 mode = IN_PHRASED_TERM;
1369                 term_start_index = it.raw() - qs.data();
1370                 goto phrased_term;
1371             }
1372         } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
1373             int old_mode = mode;
1374             mode = DEFAULT;
1375             if (!last_was_operator_needing_term && is_whitespace(*it)) {
1376                 newprev = ' ';
1377                 // Skip multiple whitespace.
1378                 do {
1379                     ++it;
1380                 } while (it != end && is_whitespace(*it));
1381                 // Don't generate a group unless the terms are only separated
1382                 // by whitespace.
1383                 if (it != end && is_wordchar(*it)) {
1384                     if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
1385                         mode = IN_GROUP2;
1386                     } else {
1387                         mode = IN_GROUP;
1388                     }
1389                 }
1390             }
1391         }
1392     }
1393 done:
1394     if (!state.error) {
1395         // Implicitly close any unclosed quotes.
1396         if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
1397             Parse(&parser, QUOTE, NULL, &state);
1398
1399         // Implicitly close all unclosed brackets.
1400         while (prefix_stack.size() > 1) {
1401             Parse(&parser, KET, NULL, &state);
1402             prefix_stack.pop_back();
1403         }
1404         Parse(&parser, 0, NULL, &state);
1405     }
1406
1407     errmsg = state.error;
1408     return state.query;
1409 }
1410
1411 }
1412 %include {
1413
1414 struct ProbQuery {
1415     Query* query = NULL;
1416     Query* love = NULL;
1417     Query* hate = NULL;
1418     // filter is a map from prefix to a query for that prefix.  Queries with
1419     // the same prefix are combined with OR, and the results of this are
1420     // combined with AND to get the full filter.
1421     map<string, Query> filter;
1422
1423     ProbQuery() {}
1424
1425     explicit
1426     ProbQuery(Query* query_) : query(query_) {}
1427
1428     ~ProbQuery() {
1429         delete query;
1430         delete love;
1431         delete hate;
1432     }
1433
1434     void add_filter(const string& grouping, const Query & q) {
1435         filter[grouping] = q;
1436     }
1437
1438     void append_filter(const string& grouping, const Query & qnew) {
1439         auto it = filter.find(grouping);
1440         if (it == filter.end()) {
1441             filter.insert(make_pair(grouping, qnew));
1442         } else {
1443             Query & q = it->second;
1444             // We OR multiple filters with the same prefix if they're
1445             // exclusive, otherwise we AND them.
1446             bool exclusive = !grouping.empty();
1447             if (exclusive) {
1448                 q |= qnew;
1449             } else {
1450                 q &= qnew;
1451             }
1452         }
1453     }
1454
1455     void add_filter_range(const string& grouping, const Query & range) {
1456         filter[grouping] = range;
1457     }
1458
1459     void append_filter_range(const string& grouping, const Query & range) {
1460         Query & q = filter[grouping];
1461         q |= range;
1462     }
1463
1464     Query merge_filters() const {
1465         auto i = filter.begin();
1466         Assert(i != filter.end());
1467         Query q = i->second;
1468         while (++i != filter.end()) {
1469             q &= i->second;
1470         }
1471         return q;
1472     }
1473 };
1474
1475 /// A group of terms separated only by whitespace.
1476 class TermGroup {
1477     vector<Term *> terms;
1478
1479     /** Controls how to handle a group where all terms are stopwords.
1480      *
1481      *  If true, then as_group() returns NULL.  If false, then the
1482      *  stopword status of the terms is ignored.
1483      */
1484     bool empty_ok;
1485
1486     TermGroup(Term* t1, Term* t2) : empty_ok(false) {
1487         add_term(t1);
1488         add_term(t2);
1489     }
1490
1491   public:
1492     /// Factory function - ensures heap allocation.
1493     static TermGroup* create(Term* t1, Term* t2) {
1494         return new TermGroup(t1, t2);
1495     }
1496
1497     ~TermGroup() {
1498         for (auto&& t : terms) {
1499             delete t;
1500         }
1501     }
1502
1503     /// Add a Term object to this TermGroup object.
1504     void add_term(Term * term) {
1505         terms.push_back(term);
1506     }
1507
1508     /// Set the empty_ok flag.
1509     void set_empty_ok() { empty_ok = true; }
1510
1511     /// Convert to a Xapian::Query * using default_op.
1512     Query * as_group(State *state) const;
1513 };
1514
1515 Query *
1516 TermGroup::as_group(State *state) const
1517 {
1518     const Xapian::Stopper * stopper = state->get_stopper();
1519     size_t stoplist_size = state->stoplist_size();
1520     bool default_op_is_positional = is_positional(state->default_op());
1521 reprocess:
1522     Query::op default_op = state->default_op();
1523     vector<Query> subqs;
1524     subqs.reserve(terms.size());
1525     if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
1526         // Check for multi-word synonyms.
1527         Database db = state->get_database();
1528
1529         string key;
1530         vector<Term*>::size_type begin = 0;
1531         vector<Term*>::size_type i = begin;
1532         while (terms.size() - i > 0) {
1533             size_t longest_match = 0;
1534             // This value is never used, but GCC 4.8 warns with
1535             // -Wmaybe-uninitialized (GCC 5.4 doesn't).
1536             vector<Term*>::size_type longest_match_end = 0;
1537             if (terms.size() - i >= 2) {
1538                 // Greedily try to match as many consecutive words as possible.
1539                 key = terms[i]->name;
1540                 key += ' ';
1541                 key += terms[i + 1]->name;
1542                 TermIterator synkey(db.synonym_keys_begin(key));
1543                 TermIterator synend(db.synonym_keys_end(key));
1544                 if (synkey != synend) {
1545                     longest_match = key.size();
1546                     longest_match_end = i + 2;
1547                     for (auto j = i + 2; j < terms.size(); ++j) {
1548                         key += ' ';
1549                         key += terms[j]->name;
1550                         synkey.skip_to(key);
1551                         if (synkey == synend)
1552                             break;
1553                         const string& found = *synkey;
1554                         if (!startswith(found, key))
1555                             break;
1556                         if (found.size() == key.size()) {
1557                             longest_match = key.size();
1558                             longest_match_end = j + 1;
1559                         }
1560                     }
1561                 }
1562             }
1563             if (longest_match == 0) {
1564                 // No multi-synonym matches at position i.
1565                 if (stopper && (*stopper)(terms[i]->name)) {
1566                     state->add_to_stoplist(terms[i]);
1567                 } else {
1568                     if (default_op_is_positional)
1569                         terms[i]->need_positions();
1570                     subqs.push_back(terms[i]->get_query_with_auto_synonyms());
1571                 }
1572                 begin = ++i;
1573                 continue;
1574             }
1575             i = longest_match_end;
1576             key.resize(longest_match);
1577
1578             vector<Query> subqs2;
1579             for (auto j = begin; j != i; ++j) {
1580                 if (stopper && (*stopper)(terms[j]->name)) {
1581                     state->add_to_stoplist(terms[j]);
1582                 } else {
1583                     if (default_op_is_positional)
1584                         terms[i]->need_positions();
1585                     subqs2.push_back(terms[j]->get_query());
1586                 }
1587             }
1588             Query q_original_terms;
1589             if (default_op_is_positional) {
1590                 q_original_terms = Query(default_op,
1591                                          subqs2.begin(), subqs2.end(),
1592                                          subqs2.size() + 9);
1593             } else {
1594                 q_original_terms = Query(default_op,
1595                                          subqs2.begin(), subqs2.end());
1596             }
1597             subqs2.clear();
1598
1599             // Use the position of the first term for the synonyms.
1600             TermIterator syn = db.synonyms_begin(key);
1601             Query q(Query::OP_SYNONYM,
1602                     SynonymIterator(syn, terms[begin]->pos, &q_original_terms),
1603                     SynonymIterator(db.synonyms_end(key)));
1604             subqs.push_back(q);
1605
1606             begin = i;
1607         }
1608     } else {
1609         vector<Term*>::const_iterator i;
1610         for (i = terms.begin(); i != terms.end(); ++i) {
1611             if (stopper && (*stopper)((*i)->name)) {
1612                 state->add_to_stoplist(*i);
1613             } else {
1614                 if (default_op_is_positional)
1615                     (*i)->need_positions();
1616                 subqs.push_back((*i)->get_query_with_auto_synonyms());
1617             }
1618         }
1619     }
1620
1621     if (!empty_ok && stopper && subqs.empty() &&
1622         stoplist_size < state->stoplist_size()) {
1623         // This group is all stopwords, so roll-back, disable stopper
1624         // temporarily, and reprocess this group.
1625         state->stoplist_resize(stoplist_size);
1626         stopper = NULL;
1627         goto reprocess;
1628     }
1629
1630     Query * q = NULL;
1631     if (!subqs.empty()) {
1632         if (default_op_is_positional) {
1633             q = new Query(default_op, subqs.begin(), subqs.end(),
1634                              subqs.size() + 9);
1635         } else {
1636             q = new Query(default_op, subqs.begin(), subqs.end());
1637         }
1638     }
1639     delete this;
1640     return q;
1641 }
1642
1643 /// Some terms which form a positional sub-query.
1644 class Terms {
1645     vector<Term *> terms;
1646
1647     /** Window size.
1648      *
1649      *  size_t(-1) means don't use positional info (so an OP_AND query gets
1650      *  created).
1651      */
1652     size_t window;
1653
1654     /** Keep track of whether the terms added all have the same list of
1655      *  prefixes.  If so, we'll build a set of phrases, one using each prefix.
1656      *  This works around the limitation that a phrase cannot have multiple
1657      *  components which are "OR" combinations of terms, but is also probably
1658      *  what users expect: i.e., if a user specifies a phrase in a field, and
1659      *  that field maps to multiple prefixes, the user probably wants a phrase
1660      *  returned with all terms having one of those prefixes, rather than a
1661      *  phrase comprised of terms with differing prefixes.
1662      */
1663     bool uniform_prefixes;
1664
1665     /** The list of prefixes of the terms added.
1666      *  This will be NULL if the terms have different prefixes.
1667      */
1668     const vector<string>* prefixes;
1669
1670     Query opwindow_subq(Query::op op,
1671                         const vector<Query>& v,
1672                         Xapian::termcount w) const {
1673         if (op == Query::OP_AND) {
1674             return Query(op, v.begin(), v.end());
1675         }
1676         return Query(op, v.begin(), v.end(), w);
1677     }
1678
1679     /// Convert to a query using the given operator and window size.
1680     Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
1681         if (window == size_t(-1)) op = Query::OP_AND;
1682         Query * q = NULL;
1683         size_t n_terms = terms.size();
1684         Xapian::termcount w = w_delta + terms.size();
1685         if (uniform_prefixes) {
1686             if (prefixes) {
1687                 for (auto&& prefix : *prefixes) {
1688                     vector<Query> subqs;
1689                     subqs.reserve(n_terms);
1690                     for (Term* t : terms) {
1691                         subqs.push_back(Query(t->make_term(prefix), 1, t->pos));
1692                     }
1693                     add_to_query(q, Query::OP_OR, opwindow_subq(op, subqs, w));
1694                 }
1695             }
1696         } else {
1697             vector<Query> subqs;
1698             subqs.reserve(n_terms);
1699             for (Term* t : terms) {
1700                 subqs.push_back(t->get_query());
1701             }
1702             q = new Query(opwindow_subq(op, subqs, w));
1703         }
1704
1705         delete this;
1706         return q;
1707     }
1708
1709     explicit Terms(bool no_pos)
1710         : window(no_pos ? size_t(-1) : 0),
1711           uniform_prefixes(true),
1712           prefixes(NULL) { }
1713
1714   public:
1715     /// Factory function - ensures heap allocation.
1716     static Terms* create(State* state) {
1717         return new Terms(state->flags & QueryParser::FLAG_NO_POSITIONS);
1718     }
1719
1720     ~Terms() {
1721         for (auto&& t : terms) {
1722             delete t;
1723         }
1724     }
1725
1726     /// Add an unstemmed Term object to this Terms object.
1727     void add_positional_term(Term * term) {
1728         const auto& term_prefixes = term->field_info->prefixes;
1729         if (terms.empty()) {
1730             prefixes = &term_prefixes;
1731         } else if (uniform_prefixes && prefixes != &term_prefixes) {
1732             if (*prefixes != term_prefixes)  {
1733                 prefixes = NULL;
1734                 uniform_prefixes = false;
1735             }
1736         }
1737         term->need_positions();
1738         terms.push_back(term);
1739     }
1740
1741     void adjust_window(size_t alternative_window) {
1742         if (alternative_window > window) window = alternative_window;
1743     }
1744
1745     /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
1746     Query * as_phrase_query() const {
1747         return as_opwindow_query(Query::OP_PHRASE, 0);
1748     }
1749
1750     /// Convert to a Xapian::Query * using OP_NEAR.
1751     Query * as_near_query() const {
1752         // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
1753         // means a window size of 11.  For more than 2 terms, we just add one
1754         // to the window size for each extra term.
1755         size_t w = window;
1756         if (w == 0) w = 10;
1757         return as_opwindow_query(Query::OP_NEAR, w - 1);
1758     }
1759
1760     /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
1761     Query * as_adj_query() const {
1762         // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
1763         // which means a window size of 11.  For more than 2 terms, we just add
1764         // one to the window size for each extra term.
1765         size_t w = window;
1766         if (w == 0) w = 10;
1767         return as_opwindow_query(Query::OP_PHRASE, w - 1);
1768     }
1769 };
1770
1771 void
1772 Term::as_positional_cjk_term(Terms * terms) const
1773 {
1774     // Add each individual CJK character to the phrase.
1775     string t;
1776     for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) {
1777         Unicode::append_utf8(t, *it);
1778         Term * c = new Term(state, t, field_info, unstemmed, stem, pos);
1779         terms->add_positional_term(c);
1780         t.resize(0);
1781     }
1782
1783     // FIXME: we want to add the n-grams as filters too for efficiency.
1784
1785     delete this;
1786 }
1787
// Helper macro to check for missing arguments to a boolean operator.
//
// A and B are the two operand expressions, which are NULL when an operand is
// missing; OP_TXT is a string literal naming the operator for the error
// message.  If either operand is missing this sets state->error, calls
// yy_parse_failed(yypParser) and returns from the enclosing function, so
// `state` and `yypParser` must be in scope where it's used.
//
// The operands are parenthesised so that each argument is negated as a whole,
// whatever expression is passed.
#define VET_BOOL_ARGS(A, B, OP_TXT) \
    do {\
        if (!(A) || !(B)) {\
            state->error = "Syntax: <expression> " OP_TXT " <expression>";\
            yy_parse_failed(yypParser);\
            return;\
        }\
    } while (0)
1797
1798 }
1799
// The semantic value of every token is a pointer to a Term.
%token_type {Term *}
// Default destructor for tokens: free the Term.
%token_destructor { delete $$; }

// Extra parameter passed to the parser; actions access it as `state`.
%extra_argument {State * state}
1804
%parse_failure {
    // Supply a generic message, but only when no more specific error has
    // been recorded already.
    if (!state->error) {
        state->error = "parse error";
    }
}
1809
%syntax_error {
    // Any syntax error fails the whole parse.
    yy_parse_failed(yypParser);
}
1813
// Operators, grouped in order of increasing precedence.  ERROR is referenced
// as [ERROR] by the empty bool_arg rule below, which gives that rule the
// lowest precedence.
%nonassoc ERROR.
%left OR.
%left XOR.
%left AND NOT.
%left NEAR ADJ.
%left LOVE HATE HATE_AFTER_AND SYNONYM.
1821
// Destructors for terminal symbols.  Each one spells out the same action as
// the %token_destructor default above, alongside a description of the token:

// TERM is a query term, including prefix (if any).
%destructor TERM { delete $$; }

// GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
// is only separated by whitespace characters.
%destructor GROUP_TERM { delete $$; }

// PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
// separated only by one or more phrase generator characters (hyphen and
// apostrophe are common examples - see is_phrase_generator() for the list
// of all punctuation which does this).
%destructor PHR_TERM { delete $$; }

// WILD_TERM is like a TERM, but has a trailing wildcard which needs to be
// expanded.
%destructor WILD_TERM { delete $$; }

// PARTIAL_TERM is like a TERM, but it's at the end of the query string and
// we're doing "search as you type".  It expands to something like WILD_TERM
// OR stemmed_form.
%destructor PARTIAL_TERM { delete $$; }

// BOOLEAN_FILTER is a query term with a prefix registered using
// add_boolean_prefix().  It's added to the query using an OP_FILTER operator
// (or OP_AND_NOT if it's negated), e.g. site:xapian.org or -site:xapian.org
%destructor BOOLEAN_FILTER { delete $$; }
1850
1851 // Grammar rules:
1852
1853 // query - The whole query - just an expr or nothing.
1854
// The query non-terminal carries no value, so just give it a dummy type.
%type query {int}

// A query consisting of an expression.
query ::= expr(E). {
    // The result is handed back via the State structure.  A NULL expression
    // gives an empty Query; otherwise the Query is copied and then freed.
    if (!E) {
        state->query = Query();
    } else {
        state->query = *E;
        delete E;
    }
}
1867
// An empty query.
query ::= . {
    // Handle a query string with no terms in by returning an empty Query via
    // the State structure.
    state->query = Query();
}
1872
1873 // expr - A query expression.
1874
// The value of an expr is a heap-allocated Query, or NULL.
%type expr {Query *}
%destructor expr { delete $$; }

// A prob_expr on its own is an expr; the value is passed through unchanged.
expr(E) ::= prob_expr(E).
1879
// Boolean AND.  VET_BOOL_ARGS() fails the parse if either operand is missing
// (NULL); otherwise the right operand is combined into the left one and then
// freed.
expr(A) ::= bool_arg(A) AND bool_arg(B). {
    VET_BOOL_ARGS(A, B, "AND");
    *A &= *B;
    delete B;
}
1885
// Boolean NOT.  The left operand may only be omitted when FLAG_PURE_NOT is
// set; the right operand is freed once it has been combined in.
expr(A) ::= bool_arg(A) NOT bool_arg(B). {
    // 'NOT foo' -> '<alldocuments> NOT foo'
    if (!A) {
        if (state->flags & QueryParser::FLAG_PURE_NOT) {
            A = new Query("", 1, 0);
        }
    }
    VET_BOOL_ARGS(A, B, "NOT");
    *A &= ~*B;
    delete B;
}
1895
// Boolean AND NOT; [NOT] gives this rule the precedence of NOT.  Both
// operands are required, and the right one is freed after being combined in.
expr(A) ::= bool_arg(A) AND NOT bool_arg(B). [NOT] {
    VET_BOOL_ARGS(A, B, "AND NOT");
    *A &= ~*B;
    delete B;
}
1901
// AND followed by a HATE_AFTER_AND token; [AND] gives this rule the
// precedence of AND.  The query is built just as for AND NOT, but the syntax
// error message names the operator as AND.
// NOTE(review): presumably HATE_AFTER_AND is a '-' directly after AND -
// confirm against the lexer.
expr(A) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND] {
    VET_BOOL_ARGS(A, B, "AND");
    *A &= ~*B;
    delete B;
}
1907
// Boolean OR.  Both operands are required, and the right one is freed after
// being combined into the left one.
expr(A) ::= bool_arg(A) OR bool_arg(B). {
    VET_BOOL_ARGS(A, B, "OR");
    *A |= *B;
    delete B;
}
1913
// Boolean XOR.  Both operands are required, and the right one is freed after
// being combined into the left one.
expr(A) ::= bool_arg(A) XOR bool_arg(B). {
    VET_BOOL_ARGS(A, B, "XOR");
    *A ^= *B;
    delete B;
}
1919
1920 // bool_arg - an argument to a boolean operator such as AND or OR.
1921
// The value of a bool_arg is a heap-allocated Query, or NULL if the operand
// is missing.
%type bool_arg {Query *}
%destructor bool_arg { delete $$; }

// An operand which is present; the value is passed through unchanged.
bool_arg(A) ::= expr(A).
1926
// A missing operand.  [ERROR] gives this empty rule the precedence of ERROR,
// which is the lowest one declared above.
bool_arg(A) ::= . [ERROR] {
    // Set the argument to NULL, which enables the bool_arg-using rules in
    // expr above to report uses of AND, OR, etc which don't have two
    // arguments.
    A = NULL;
}
1933
1934 // prob_expr - a single compound term, or a prob.
1935
// The value of a prob_expr is a heap-allocated Query, or NULL.
%type prob_expr {Query *}
%destructor prob_expr { delete $$; }

// Flatten a ProbQuery (its query plus any loved terms, boolean filters and
// hated terms) into a single Query, then free the ProbQuery.
prob_expr(E) ::= prob(P). {
    // Take over the query held by the ProbQuery.
    E = P->query;
    P->query = NULL;

    // Handle any "+ terms".
    if (P->love) {
        if (!P->love->empty() && E) {
            // Swap so that the loved query is the one which the rest gets
            // added to with OP_AND_MAYBE.
            swap(E, P->love);
            add_to_query(E, Query::OP_AND_MAYBE, P->love);
        } else {
            // Either +<nothing>, or there's only the loved part; deleting a
            // NULL pointer is a no-op in the latter case.
            delete E;
            E = P->love;
        }
        P->love = NULL;
    }

    // Handle any boolean filters.
    if (!P->filter.empty()) {
        if (!E) {
            // Make the query a boolean one.
            E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
        } else {
            add_to_query(E, Query::OP_FILTER, P->merge_filters());
        }
    }

    // Handle any "- terms".
    if (P->hate && !P->hate->empty()) {
        if (E) {
            *E = Query(Query::OP_AND_NOT, *E, *P->hate);
        } else {
            // Can't just hate!
            yy_parse_failed(yypParser);
            return;
        }
    }
    delete P;
}
1976
// A single term is a prob_expr; the value is passed through unchanged.
prob_expr(E) ::= term(E).
1978
1979 // prob - a sub-expression consisting of stop_terms, "+" terms, "-" terms,
1980 // boolean filters, and/or ranges.
1981 //
1982 // Note: stop_term can also be several other things other than a simple term!
1983
// The value of a prob is a heap-allocated ProbQuery.
%type prob {ProbQuery *}
%destructor prob { delete $$; }

// A prob which starts with a range: create a ProbQuery holding just the
// range filter.  The grouping is copied before as_range_query() is called.
// NOTE(review): nothing here deletes the RANGE token, so presumably
// as_range_query() consumes it - confirm.
prob(P) ::= RANGE(R). {
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P = new ProbQuery; /*P-overwrites-R*/
    P->add_filter_range(grouping, range);
}
1993
// A range following an existing stop_prob: append the range filter to it.
// The grouping is copied before as_range_query() is called.
// NOTE(review): nothing here deletes the RANGE token, so presumably
// as_range_query() consumes it - confirm.
prob(P) ::= stop_prob(P) RANGE(R). {
    string grouping = R->name;
    const Query & range = R->as_range_query();
    P->append_filter_range(grouping, range);
}
1999
// Two stop_terms in a row start a prob, combined with the default operator.
// Either value may be NULL (which is how a stopword is represented).
prob(P) ::= stop_term(T) stop_term(U). {
    P = new ProbQuery(T); /*P-overwrites-T*/
    if (U) {
        const Query::op op = state->default_op();
        if (!P->query || !is_positional(op)) {
            add_to_query(P->query, op, U);
        } else {
            // If default_op is OP_NEAR or OP_PHRASE, set the window size to
            // 11 for the first pair of terms and it will automatically grow
            // by one for each subsequent term.
            Query * operands[2] = { P->query, U };
            *(P->query) = Query(op, operands, operands + 2, 11);
            delete U;
        }
    }
}
2016
// A further stop_term is added to the query of an existing prob using the
// default operator.
prob(P) ::= prob(P) stop_term(T). {
    // If T is a stopword, there's nothing to do here.
    if (T) add_to_query(P->query, state->default_op(), T);
}
2021
// A prob which starts with a loved term.  The term goes in the love slot of
// the new ProbQuery, except that when the default operator is OP_AND it goes
// in the query slot.
prob(P) ::= LOVE term(T). {
    P = new ProbQuery;
    if (state->default_op() != Query::OP_AND) {
        P->love = T;
    } else {
        P->query = T;
    }
}
2030
// A loved term following an existing stop_prob.
prob(P) ::= stop_prob(P) LOVE term(T). {
    if (state->default_op() != Query::OP_AND) {
        add_to_query(P->love, Query::OP_AND, T);
    } else {
        /* The default op is AND, so we just put loved terms into the query
         * (in this case the only effect of love is to ignore the stopword
         * list). */
        add_to_query(P->query, Query::OP_AND, T);
    }
}
2041
// A prob which starts with a hated term: the term's query becomes the hate
// slot of a new ProbQuery.
prob(P) ::= HATE term(T). {
    P = new ProbQuery;
    P->hate = T;
}
2046
// A hated term following an existing stop_prob: hated terms are combined
// with OP_OR.
prob(P) ::= stop_prob(P) HATE term(T). {
    add_to_query(P->hate, Query::OP_OR, T);
}
2050
// A prob which starts with a hated boolean filter: a copy of the filter's
// query becomes the hate slot of a new ProbQuery, and the token is freed.
prob(P) ::= HATE BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->hate = new Query(T->get_query());
    delete T;
}
2056
// A hated boolean filter following an existing stop_prob: its query is
// combined into the hate slot with OP_OR, and the token is freed.
prob(P) ::= stop_prob(P) HATE BOOLEAN_FILTER(T). {
    add_to_query(P->hate, Query::OP_OR, T->get_query());
    delete T;
}
2061
// A prob which starts with a boolean filter: the filter is added to a new
// ProbQuery under its grouping, and the token is freed.
prob(P) ::= BOOLEAN_FILTER(T). {
    P = new ProbQuery;
    P->add_filter(T->get_grouping(), T->get_query());
    delete T;
}
2067
// A boolean filter following an existing stop_prob: the filter is appended
// under its grouping, and the token is freed.
prob(P) ::= stop_prob(P) BOOLEAN_FILTER(T). {
    P->append_filter(T->get_grouping(), T->get_query());
    delete T;
}
2072
// A prob which starts with a loved boolean filter.
// NOTE(review): this stores into the filter map directly, whereas the rule
// for a plain BOOLEAN_FILTER calls add_filter() - confirm that the two are
// meant to be equivalent.
prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    P = new ProbQuery;
    P->filter[T->get_grouping()] = T->get_query();
    delete T;
}
2079
// A loved boolean filter following an existing stop_prob.
// NOTE(review): this ORs into the filter map directly, whereas the rule for
// a plain BOOLEAN_FILTER after a stop_prob calls append_filter() - confirm
// that the two are meant to be equivalent.
prob(P) ::= stop_prob(P) LOVE BOOLEAN_FILTER(T). {
    // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    // We OR filters with the same prefix...
    Query & q = P->filter[T->get_grouping()];
    q |= T->get_query();
    delete T;
}
2087
2088 // stop_prob - A prob or a stop_term.
2089
// The value of a stop_prob is a heap-allocated ProbQuery.
%type stop_prob {ProbQuery *}
%destructor stop_prob { delete $$; }

// A prob is a stop_prob; the value is passed through unchanged.
stop_prob(P) ::= prob(P).
2094
// A lone stop_term is wrapped in a new ProbQuery, which is stored in place of
// the stop_term's value.
stop_prob(P) ::= stop_term(T). {
    P = new ProbQuery(T); /*P-overwrites-T*/
}
2098
2099 // stop_term - A term which should be checked against the stopword list,
2100 // or a compound_term.
2101 //
2102 // If a term is loved, hated, or in a phrase, we don't want to consult the
2103 // stopword list, so stop_term isn't used there (instead term is).
2104
// The value of a stop_term is a heap-allocated Query, or NULL for a stopword.
%type stop_term {Query *}
%destructor stop_term { delete $$; }

// A plain term which is checked against the stopword list.  The token is
// freed in either case.
stop_term(T) ::= TERM(U). {
    if (!state->is_stopword(U)) {
        T = new Query(U->get_query_with_auto_synonyms());
    } else {
        // A stopword yields no query, but is noted in the stoplist.
        state->add_to_stoplist(U);
        T = NULL;
    }
    delete U;
}
2117
// A compound_term is a stop_term; the value is passed through unchanged.
stop_term(T) ::= compound_term(T).
2119
2120 // term - A term or a compound_term.
2121
// The value of a term is a heap-allocated Query.
%type term {Query *}
%destructor term { delete $$; }

// A plain term, with no check against the stopword list.  The token is freed
// once its query has been built.
term(T) ::= TERM(U). {
    T = new Query(U->get_query_with_auto_synonyms());
    delete U;
}
2129
// A compound_term is a term; the value is passed through unchanged.
term(T) ::= compound_term(T).
2131
2132 // compound_term - A WILD_TERM, a quoted phrase (with or without prefix), a
2133 // phrased_term, group, near_expr, adj_expr, or a bracketed subexpression (with
2134 // or without prefix).
2135
// The value of a compound_term is a heap-allocated Query, or NULL.
%type compound_term {Query *}
%destructor compound_term { delete $$; }

// A term with a trailing wildcard.  The query is stored in place of the
// token's value.
// NOTE(review): nothing here deletes the token, so presumably
// as_wildcarded_query() consumes it - confirm.
compound_term(T) ::= WILD_TERM(U).
        { T = U->as_wildcarded_query(state); /*T-overwrites-U*/ }
2141
// A partially entered term.  The query is stored in place of the token's
// value.
// NOTE(review): nothing here deletes the token, so presumably
// as_partial_query() consumes it - confirm.
compound_term(T) ::= PARTIAL_TERM(U).
        { T = U->as_partial_query(state); /*T-overwrites-U*/ }
2144
// A double-quoted phrase.  as_phrase_query() deletes the Terms object, so
// it isn't freed here.
compound_term(T) ::= QUOTE phrase(P) QUOTE.
        { T = P->as_phrase_query(); }
2147
// Terms linked by phrase generator punctuation become a phrase query.
// as_phrase_query() deletes the Terms object, and the query is stored in
// place of its value.
compound_term(T) ::= phrased_term(P).
        { T = P->as_phrase_query(); /*T-overwrites-P*/ }
2150
// A group of whitespace-separated terms.  The query is stored in place of
// the TermGroup's value.
// NOTE(review): nothing here deletes the TermGroup, so presumably as_group()
// consumes it - confirm.
compound_term(T) ::= group(P).
        { T = P->as_group(state); /*T-overwrites-P*/ }
2153
// Terms joined by NEAR become an OP_NEAR query (or OP_AND if positional
// information isn't being used).  as_near_query() deletes the Terms object,
// and the query is stored in place of its value.
compound_term(T) ::= near_expr(P).
        { T = P->as_near_query(); /*T-overwrites-P*/ }
2156
// Terms joined by ADJ become an OP_PHRASE query (or OP_AND if positional
// information isn't being used).  as_adj_query() deletes the Terms object,
// and the query is stored in place of its value.
compound_term(T) ::= adj_expr(P).
        { T = P->as_adj_query(); /*T-overwrites-P*/ }
2159
// A bracketed sub-expression: the value of the inner expr is passed on
// unchanged.
compound_term(T) ::= BRA expr(E) KET.
        { T = E; }
2162
// A term preceded by the SYNONYM operator: the query comes from
// get_query_with_synonyms() rather than get_query_with_auto_synonyms() as
// for a plain TERM.  The token is freed once its query has been built.
compound_term(T) ::= SYNONYM TERM(U). {
    T = new Query(U->get_query_with_synonyms());
    delete U;
}
2167
// A CJK term outside a quoted phrase.  The query is stored in place of the
// token's value.
// NOTE(review): nothing here deletes the token, so presumably as_cjk_query()
// consumes it - confirm.
compound_term(T) ::= CJKTERM(U). {
    T = U->as_cjk_query(); /*T-overwrites-U*/
}
2171
2172 // phrase - The "inside the quotes" part of a double-quoted phrase.
2173
// The value of a phrase is a heap-allocated Terms object.
%type phrase {Terms *}

%destructor phrase { delete $$; }

// Start a phrase with a single term.  The Terms object takes ownership of
// the Term.
phrase(P) ::= TERM(T). {
    P = Terms::create(state);
    P->add_positional_term(T);
}
2182
// Start a phrase with a CJK term.  as_positional_cjk_term() adds one Term
// per character to the phrase and then deletes the original token.
phrase(P) ::= CJKTERM(T). {
    P = Terms::create(state);
    T->as_positional_cjk_term(P);
}
2187
// Extend a phrase with another term.  The Terms object takes ownership of
// the Term.
phrase(P) ::= phrase(P) TERM(T). {
    P->add_positional_term(T);
}
2191
// Extend a phrase with a CJK term.  as_positional_cjk_term() adds one Term
// per character to the phrase and then deletes the original token.
phrase(P) ::= phrase(P) CJKTERM(T). {
    T->as_positional_cjk_term(P);
}
2195
2196 // phrased_term - A phrased term works like a single term, but is actually
2197 // 2 or more terms linked together into a phrase by punctuation.  There must be
2198 // at least 2 terms in order to be able to have punctuation between the terms!
2199
// The value of a phrased_term is a heap-allocated Terms object.
%type phrased_term {Terms *}
%destructor phrased_term { delete $$; }

// Start a phrased term from its first two terms.  The Terms object takes
// ownership of both Term objects.
phrased_term(P) ::= TERM(T) PHR_TERM(U). {
    P = Terms::create(state);
    P->add_positional_term(T);
    P->add_positional_term(U);
}
2208
// Extend a phrased term with another term.  The Terms object takes ownership
// of the Term.
phrased_term(P) ::= phrased_term(P) PHR_TERM(T). {
    P->add_positional_term(T);
}
2212
2213 // group - A group of terms separated only by whitespace - candidates for
2214 // multi-term synonyms.
2215
// The value of a group is a heap-allocated TermGroup.
%type group {TermGroup *}
%destructor group { delete $$; }

// Start a group from its first two terms.  The TermGroup is stored in place
// of the first token's value.
// NOTE(review): presumably the TermGroup takes ownership of both Term
// objects - confirm against TermGroup::create().
group(P) ::= TERM(T) GROUP_TERM(U). {
    P = TermGroup::create(T, U); /*P-overwrites-T*/
}
2222
// Extend a group with another term.
// NOTE(review): presumably the TermGroup takes ownership of the Term -
// confirm against TermGroup::add_term().
group(P) ::= group(P) GROUP_TERM(T). {
    P->add_term(T);
}
2226
// An EMPTY_GROUP_OK token after a group just calls set_empty_ok() on it.
group(P) ::= group(P) EMPTY_GROUP_OK. {
    P->set_empty_ok();
}
2230
2231 // near_expr - 2 or more terms with NEAR in between.  There must be at least 2
2232 // terms in order for there to be any NEAR operators!
2233
// The value of a near_expr is a heap-allocated Terms object.
%type near_expr {Terms *}
%destructor near_expr { delete $$; }

// Start a NEAR expression from two terms.  The Terms object takes ownership
// of both Term objects.  The value of the NEAR token may be NULL; otherwise
// its get_termpos() is offered as a window size and the token is freed.
near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
    P = Terms::create(state);
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2246
// Extend a NEAR expression with another term.  The Terms object takes
// ownership of the Term.  The value of the NEAR token may be NULL; otherwise
// its get_termpos() is offered as a window size (which only ever increases
// the window) and the token is freed.
near_expr(P) ::= near_expr(P) NEAR(N) TERM(T). {
    P->add_positional_term(T);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2254
2255 // adj_expr - 2 or more terms with ADJ in between.  There must be at least 2
2256 // terms in order for there to be any ADJ operators!
2257
// The value of an adj_expr is a heap-allocated Terms object.
%type adj_expr {Terms *}
%destructor adj_expr { delete $$; }

// Start an ADJ expression from two terms.  The Terms object takes ownership
// of both Term objects.  The value of the ADJ token may be NULL; otherwise
// its get_termpos() is offered as a window size and the token is freed.
adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
    P = Terms::create(state);
    P->add_positional_term(T);
    P->add_positional_term(U);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2270
// Extend an ADJ expression with another term.  The Terms object takes
// ownership of the Term.  The value of the ADJ token may be NULL; otherwise
// its get_termpos() is offered as a window size (which only ever increases
// the window) and the token is freed.
adj_expr(P) ::= adj_expr(P) ADJ(N) TERM(T). {
    P->add_positional_term(T);
    if (N) {
        P->adjust_window(N->get_termpos());
        delete N;
    }
}
2278
2279 // Select lemon syntax highlighting in vim editor: vim: syntax=lemon