Add regression test for previous commit
[xapian.git] / xapian-core / queryparser / termgenerator_internal.cc
blob470131c7a25d6830e276f2f0e0d398f00c04cd8e
1 /** @file
2 * @brief TermGenerator class internals
3 */
4 /* Copyright (C) 2007,2010,2011,2012,2015,2016,2017,2018,2019,2020 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "termgenerator_internal.h"
25 #include "api/omenquireinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
35 #include <algorithm>
36 #include <cmath>
37 #include <deque>
38 #include <limits>
39 #include <list>
40 #include <string>
41 #include <unordered_map>
42 #include <vector>
44 #include "cjk-tokenizer.h"
46 using namespace std;
48 namespace Xapian {
50 static inline bool
51 U_isupper(unsigned ch)
53 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
56 static inline unsigned
57 check_wordchar(unsigned ch)
59 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
60 return 0;
63 static inline bool
64 should_stem(const std::string & term)
66 const unsigned int SHOULD_STEM_MASK =
67 (1 << Unicode::LOWERCASE_LETTER) |
68 (1 << Unicode::TITLECASE_LETTER) |
69 (1 << Unicode::MODIFIER_LETTER) |
70 (1 << Unicode::OTHER_LETTER);
71 Utf8Iterator u(term);
72 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
static const unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();

/** Classify a character appearing between two word characters.
 *
 *  @return The character to append to the term (Unicode apostrophe variants
 *	    are normalised to ASCII '\''); UNICODE_IGNORE for zero-width
 *	    characters which should be skipped entirely; or 0 if the
 *	    character ends the term.
 */
static inline unsigned
check_infix(unsigned ch)
{
    switch (ch) {
	case '\'':
	case '&':
	case 0xb7:
	case 0x5f4:
	case 0x2027:
	    // Unicode includes all these except '&' in its word boundary
	    // rules, as well as 0x2019 (which we handle below) and ':' (for
	    // Swedish apparently, but we ignore this for now as it's
	    // problematic in real world cases).
	    return ch;
	case 0x2019: // Unicode apostrophe and single closing quote.
	case 0x201b: // Unicode single opening quote with the tail rising.
	    return '\'';
	case 0x200b: // ZERO WIDTH SPACE
	case 0x200c: // ZERO WIDTH NON-JOINER
	case 0x200d: // ZERO WIDTH JOINER
	case 0x2060: // WORD JOINER
	case 0xfeff: // ZERO WIDTH NO-BREAK SPACE
	    return UNICODE_IGNORE;
	default:
	    return 0;
    }
}
98 static inline unsigned
99 check_infix_digit(unsigned ch)
101 // This list of characters comes from Unicode's word identifying algorithm.
102 switch (ch) {
103 case ',':
104 case '.':
105 case ';':
106 case 0x037e: // GREEK QUESTION MARK
107 case 0x0589: // ARMENIAN FULL STOP
108 case 0x060D: // ARABIC DATE SEPARATOR
109 case 0x07F8: // NKO COMMA
110 case 0x2044: // FRACTION SLASH
111 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
112 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
113 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
114 return ch;
116 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
117 return UNICODE_IGNORE;
118 return 0;
121 static inline bool
122 is_digit(unsigned ch) {
123 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
/** Check for characters which may sensibly end a term.
 *
 *  Currently '+' (e.g. "C++") and '#' (e.g. "C#") are allowed.
 *
 *  @return ch if it's an allowed suffix character, otherwise 0.
 */
static inline unsigned
check_suffix(unsigned ch)
{
    // FIXME: what about '-'?
    switch (ch) {
	case '+':
	case '#':
	    return ch;
	default:
	    return 0;
    }
}
/** Templated framework for processing terms.
 *
 *  Calls action(term, positional, itor) for each term to add, where term is
 *  a std::string holding the term, positional is a bool indicating if this
 *  term carries positional information, and itor is a Utf8Iterator pointing
 *  just after the term in the input.  The action returns true to continue
 *  parsing, or false to stop early.
 *
 *  @param itor		  Start of the UTF-8 input text.
 *  @param cjk_ngram	  Segment CJK text into n-grams?
 *  @param with_positions Request positional information for normal terms?
 */
template<typename ACTION>
static void
parse_terms(Utf8Iterator itor, bool cjk_ngram, bool with_positions, ACTION action)
{
    while (true) {
	// Advance to the start of the next term.
	unsigned ch;
	while (true) {
	    if (itor == Utf8Iterator()) return;
	    ch = check_wordchar(*itor);
	    if (ch) break;
	    ++itor;
	}

	string term;
	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
	// Don't worry if there's a trailing '.' or not.
	if (U_isupper(*itor)) {
	    const Utf8Iterator end;
	    Utf8Iterator p = itor;
	    do {
		Unicode::append_utf8(term, Unicode::tolower(*p++));
	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
	    // One letter does not make an acronym!  If we handled a single
	    // uppercase letter here, we wouldn't catch M&S below.
	    if (term.size() > 1) {
		// Check there's not a (lower case) letter or digit
		// immediately after it.
		if (p == end || !Unicode::is_wordchar(*p)) {
		    itor = p;
		    goto endofterm;
		}
	    }
	    // Not an acronym after all - discard what we built and fall
	    // through to normal term parsing from the same position.
	    term.resize(0);
	}

	while (true) {
	    if (cjk_ngram &&
		CJK::codepoint_is_cjk(*itor) &&
		Unicode::is_wordchar(*itor)) {
		// Emit CJK text as n-grams; only unigrams carry positions.
		CJKTokenIterator tk(itor);
		while (tk != CJKTokenIterator()) {
		    const string& cjk_token = *tk;
		    if (!action(cjk_token, with_positions && tk.unigram(),
				tk.get_utf8iterator()))
			return;
		    ++tk;
		}
		// Update itor to end of CJK text span.
		itor = tk.get_utf8iterator();
		while (true) {
		    if (itor == Utf8Iterator()) return;
		    ch = check_wordchar(*itor);
		    if (ch) break;
		    ++itor;
		}
		continue;
	    }
	    // Accumulate a run of word characters into `term`.
	    unsigned prevch;
	    do {
		Unicode::append_utf8(term, ch);
		prevch = ch;
		if (++itor == Utf8Iterator() ||
		    (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
		    goto endofterm;
		ch = check_wordchar(*itor);
	    } while (ch);

	    // The term only continues past a non-word character if it's an
	    // allowed "infix" character with a word character after it.
	    Utf8Iterator next(itor);
	    ++next;
	    if (next == Utf8Iterator()) break;
	    unsigned nextch = check_wordchar(*next);
	    if (!nextch) break;
	    unsigned infix_ch = *itor;
	    if (is_digit(prevch) && is_digit(*next)) {
		infix_ch = check_infix_digit(infix_ch);
	    } else {
		// Handle things like '&' in AT&T, apostrophes, etc.
		infix_ch = check_infix(infix_ch);
	    }
	    if (!infix_ch) break;
	    if (infix_ch != UNICODE_IGNORE)
		Unicode::append_utf8(term, infix_ch);
	    ch = nextch;
	    itor = next;
	}

	// Allow a short run of trailing suffix characters (e.g. "C++").
	size_t len = term.size();
	unsigned count = 0;
	while ((ch = check_suffix(*itor))) {
	    if (++count > 3) {
		// More than 3 suffix characters - treat them as not part of
		// the term after all.
		term.resize(len);
		break;
	    }
	    Unicode::append_utf8(term, ch);
	    if (++itor == Utf8Iterator()) goto endofterm;
	}
	// Don't index fish+chips as fish+ chips.
	if (Unicode::is_wordchar(*itor))
	    term.resize(len);

endofterm:
	if (!action(term, with_positions, itor))
	    return;
    }
}
/** Tokenise text and add the resulting terms to the document being built.
 *
 *  @param itor		Start of the UTF-8 text to index.
 *  @param wdf_inc	Amount to increase each term's wdf by.
 *  @param prefix	Term prefix to apply (empty string for none).
 *  @param with_positions	Record positional information for terms?
 */
void
TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
				    const string & prefix, bool with_positions)
{
    bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();

    // With no stopper set, stopping is a no-op whatever stop_mode says.
    stop_strategy current_stop_mode;
    if (!stopper.get()) {
	current_stop_mode = TermGenerator::STOP_NONE;
    } else {
	current_stop_mode = stop_mode;
    }

    parse_terms(itor, cjk_ngram, with_positions,
		[=
#if __cplusplus >= 201907L
// C++20 no longer supports implicit `this` in lambdas but older C++ versions
// don't allow `this` here.
		 , this
#endif
		](const string & term, bool positional, const Utf8Iterator &) {
	if (term.size() > max_word_length) return true;

	if (current_stop_mode == TermGenerator::STOP_ALL &&
	    (*stopper)(term)) {
	    return true;
	}

	// Add the unstemmed form (except under STEM_ALL/STEM_ALL_Z, where
	// only the stemmed form is wanted).
	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_NONE ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    if (positional) {
		doc.add_posting(prefix + term, ++cur_pos, wdf_inc);
	    } else {
		doc.add_term(prefix + term, wdf_inc);
	    }
	}

	// MSVC seems to need "this->" on member variables in this
	// situation.
	if ((this->flags & FLAG_SPELLING) && prefix.empty())
	    db.add_spelling(term);

	if (strategy == TermGenerator::STEM_NONE || stemmer.is_none())
	    return true;

	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    if (current_stop_mode == TermGenerator::STOP_STEMMED &&
		(*stopper)(term))
		return true;

	    // Note, this uses the lowercased term, but that's OK as we
	    // only want to avoid stemming terms starting with a digit.
	    if (!should_stem(term)) return true;
	}

	// Add stemmed form without positional information.
	const string& stem = stemmer(term);
	if (rare(stem.empty())) return true;
	string stemmed_term;
	if (strategy != TermGenerator::STEM_ALL) {
	    // Stemmed forms are marked with a "Z" prefix (except under
	    // STEM_ALL, where no unstemmed form is added to distinguish
	    // them from).
	    stemmed_term += "Z";
	}
	stemmed_term += prefix;
	stemmed_term += stem;
	if (strategy != TermGenerator::STEM_SOME && with_positions) {
	    if (strategy != TermGenerator::STEM_SOME_FULL_POS) ++cur_pos;
	    doc.add_posting(stemmed_term, cur_pos, wdf_inc);
	} else {
	    doc.add_term(stemmed_term, wdf_inc);
	}
	return true;
    });
}
/// One term (plus any preceding inter-word gap) flowing through SnipPipe.
struct Sniplet {
    /// Relevance weight for this term, or NULL if it carries none.
    double* relevance;

    /// Byte offset of the end of this term in the text being processed.
    size_t term_end;

    /// Number of terms to highlight (0 = none; > 1 = a phrase).
    size_t highlight;

    Sniplet(double* relevance_, size_t term_end_, size_t highlight_)
	: relevance(relevance_), term_end(term_end_), highlight(highlight_) { }
};
/** Pipeline which slides a window over the terms of the text to find the
 *  most relevant snippet.
 */
class SnipPipe {
    /// Terms currently inside the sliding window.
    deque<Sniplet> pipe;

    /// Terms forming the best window found so far.
    deque<Sniplet> best_pipe;

    // Requested length for snippet.
    size_t length;

    // Position in text of start of current pipe contents.
    size_t begin = 0;

    // Rolling sum of the current pipe contents.
    double sum = 0;

    /// Terms remaining in a phrase highlight currently being emitted.
    size_t phrase_len = 0;

  public:
    /// Byte offset of the start of the best window found so far.
    size_t best_begin = 0;

    /// Byte offset of the end of the best window found so far.
    size_t best_end = 0;

    /// Relevance sum of the best window found so far.
    double best_sum = 0;

    // Add one to length to allow for inter-word space.
    // FIXME: We ought to correctly allow for multiple spaces.
    explicit SnipPipe(size_t length_) : length(length_ + 1) { }

    /// Feed one term into the pipeline; returns false to stop early.
    bool pump(double* r, size_t t, size_t h, unsigned flags);

    /// Finish feeding terms in and trim the window.
    void done();

    /// Emit the next chunk of the snippet; returns false when finished.
    bool drain(const string & input,
	       const string & hi_start,
	       const string & hi_end,
	       const string & omit,
	       string & output);
};
// Decay factor applied to a term's relevance each time it's counted, so
// repeated occurrences contribute progressively less.
#define DECAY 2.0

/** Feed one term into the sliding window.
 *
 *  @param r	 Relevance weight for the term (NULL for none).
 *  @param t	 Byte offset of the end of the term in the text.
 *  @param h	 Number of terms to highlight (> 1 means this term completes
 *		 a phrase).
 *  @param flags Snippet flags (SNIPPET_EXHAUSTIVE affects early termination).
 *
 *  @return false if a good enough snippet has already been found and the
 *	    caller can stop feeding terms in.
 */
inline bool
SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
{
    if (h > 1) {
	if (pipe.size() >= h - 1) {
	    // The final term of a phrase is entering the window.  Peg the
	    // phrase's relevance onto the first term of the phrase, so it'll
	    // be removed from `sum` when the phrase starts to leave the
	    // window.
	    auto & phrase_start = pipe[pipe.size() - (h - 1)];
	    if (phrase_start.relevance) {
		*phrase_start.relevance *= DECAY;
		sum -= *phrase_start.relevance;
	    }
	    sum += *r;
	    phrase_start.relevance = r;
	    phrase_start.highlight = h;
	    *r /= DECAY;
	}
	// The phrase's weight is now carried by its first term, so this
	// final term itself enters the window with no weight.
	r = NULL;
	h = 0;
    }
    pipe.emplace_back(r, t, h);
    if (r) {
	sum += *r;
	*r /= DECAY;
    }

    // If necessary, discard words from the start of the pipe until it has the
    // desired length.
    // FIXME: Also shrink the window past words with relevance < 0?
    while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
	const Sniplet& word = pipe.front();
	if (word.relevance) {
	    *word.relevance *= DECAY;
	    sum -= *word.relevance;
	}
	begin = word.term_end;
	// Words leaving the window which are still part of the current best
	// window get kept in best_pipe for later output by drain().
	if (best_end >= begin)
	    best_pipe.push_back(word);
	pipe.pop_front();
	// E.g. can happen if the current term is longer than the requested
	// length!
	if (rare(pipe.empty())) break;
    }

    // Using > here doesn't work well, as we don't extend a snippet over terms
    // with 0 weight.
    if (sum >= best_sum) {
	// Discard any part of `best_pipe` which is before `begin`.
	if (begin >= best_end) {
	    best_pipe.clear();
	} else {
	    while (!best_pipe.empty() &&
		   best_pipe.front().term_end <= begin) {
		best_pipe.pop_front();
	    }
	}
	best_sum = sum;
	best_begin = begin;
	best_end = t;
    } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
	if (best_sum > 0 && best_end < begin) {
	    // We found something, and we aren't still looking near it.
	    // FIXME: Benchmark this and adjust if necessary.
	    return false;
	}
    }
    return true;
}
/// Finish feeding terms in: trim the window back to the best range found.
inline void
SnipPipe::done()
{
    // Discard any part of `pipe` which is after `best_end`.
    if (begin >= best_end) {
	pipe.clear();
    } else {
	// We should never empty the pipe (as that case should be handled
	// above).
	// NOTE(review): `rare()` is a branch-prediction hint only, so this
	// is behaviourally correct either way, but the hint looks inverted
	// (the comment above says the pipe should rarely be empty) - confirm
	// against upstream intent.
	while (rare(!pipe.empty()) &&
	       pipe.back().term_end > best_end) {
	    pipe.pop_back();
	}
    }
}
462 // Check if a non-word character is should be included at the start of the
463 // snippet. We want to include certain leading non-word characters, but not
464 // others.
465 static inline bool
466 snippet_check_leading_nonwordchar(unsigned ch) {
467 if (Unicode::is_currency(ch) ||
468 Unicode::get_category(ch) == Unicode::OPEN_PUNCTUATION ||
469 Unicode::get_category(ch) == Unicode::INITIAL_QUOTE_PUNCTUATION) {
470 return true;
472 switch (ch) {
473 case '"':
474 case '#':
475 case '%':
476 case '&':
477 case '\'':
478 case '+':
479 case '-':
480 case '/':
481 case '<':
482 case '@':
483 case '\\':
484 case '`':
485 case '~':
486 case 0x00A1: // INVERTED EXCLAMATION MARK
487 case 0x00A7: // SECTION SIGN
488 case 0x00BF: // INVERTED QUESTION MARK
489 return true;
491 return false;
494 // Check if a non-word character is should be included at the end of the
495 // snippet. We want to include certain trailing non-word characters, but not
496 // others.
497 static inline bool
498 snippet_check_trailing_nonwordchar(unsigned ch) {
499 if (Unicode::is_currency(ch) ||
500 Unicode::get_category(ch) == Unicode::CLOSE_PUNCTUATION ||
501 Unicode::get_category(ch) == Unicode::FINAL_QUOTE_PUNCTUATION) {
502 return true;
504 switch (ch) {
505 case '"':
506 case '%':
507 case '\'':
508 case '+':
509 case '-':
510 case '/':
511 case '>':
512 case '@':
513 case '\\':
514 case '`':
515 case '~':
516 return true;
518 return false;
/** Append the bytes [p, end) to output, escaping '&', '<' and '>' so the
 *  result is safe to embed in XML/HTML.
 */
static inline void
append_escaping_xml(const char* p, const char* end, std::string& output)
{
    for (; p != end; ++p) {
	switch (*p) {
	    case '&':
		output += "&amp;";
		break;
	    case '<':
		output += "&lt;";
		break;
	    case '>':
		output += "&gt;";
		break;
	    default:
		output += *p;
	}
    }
}
/** Emit the next chunk of the snippet into `output`.
 *
 *  Called repeatedly after done(); each call appends (at most) one word of
 *  the chosen window, adding highlight markers and escaping XML special
 *  characters as it goes.
 *
 *  @param input	The full text the snippet is drawn from.
 *  @param hi_start	Marker inserted before a highlighted match.
 *  @param hi_end	Marker inserted after a highlighted match.
 *  @param omit		Marker used where text is elided (e.g. "...").
 *  @param output	String the snippet is accumulated into.
 *
 *  @return true if there is more to emit; false once the snippet is
 *	    complete.
 */
inline bool
SnipPipe::drain(const string & input,
		const string & hi_start,
		const string & hi_end,
		const string & omit,
		string & output)
{
    if (best_pipe.empty() && !pipe.empty()) {
	swap(best_pipe, pipe);
    }

    if (best_pipe.empty()) {
	// All words have been emitted - deal with the tail of the text.
	size_t tail_len = input.size() - best_end;
	if (tail_len == 0) return false;

	// See if this is the end of a sentence.
	// FIXME: This is quite simplistic - look at the Unicode rules:
	// https://unicode.org/reports/tr29/#Sentence_Boundaries
	bool sentence_end = false;
	Utf8Iterator i(input.data() + best_end, tail_len);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (sentence_end && Unicode::is_whitespace(ch)) break;

	    // Allow "...", "!!", "!?!", etc...
	    sentence_end = (ch == '.' || ch == '?' || ch == '!');

	    if (Unicode::is_wordchar(ch)) break;
	    ++i;
	}

	if (sentence_end) {
	    // Include end of sentence punctuation.
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    return false;
	}

	// Include trailing punctuation which includes meaning or context.
	i.assign(input.data() + best_end, tail_len);
	int trailing_punc = 0;
	while (i != Utf8Iterator() && snippet_check_trailing_nonwordchar(*i)) {
	    // But limit how much trailing punctuation we include.
	    if (++trailing_punc > 4) {
		trailing_punc = 0;
		break;
	    }
	    ++i;
	}
	if (trailing_punc) {
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    if (i == Utf8Iterator()) return false;
	}

	// Append "..." or equivalent as this doesn't seem to be the start
	// of a sentence.
	output += omit;

	return false;
    }

    const Sniplet & word = best_pipe.front();

    if (output.empty()) {
	// Start of the snippet.
	enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;

	Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    switch (sentence_boundary) {
		case NO:
		    if (ch == '.' || ch == '?' || ch == '!') {
			sentence_boundary = PUNC;
		    }
		    break;
		case PUNC:
		    if (Unicode::is_whitespace(ch)) {
			sentence_boundary = YES;
		    } else if (ch == '.' || ch == '?' || ch == '!') {
			// Allow "...", "!!", "!?!", etc...
		    } else {
			sentence_boundary = NO;
		    }
		    break;
		case YES:
		    break;
	    }

	    // Start the snippet at the start of the first word, but include
	    // certain punctuation too.
	    if (Unicode::is_wordchar(ch)) {
		// But limit how much leading punctuation we include.
		size_t word_begin = i.raw() - input.data();
		if (word_begin - best_begin > 4) {
		    best_begin = word_begin;
		}
		break;
	    }
	    ++i;
	    if (!snippet_check_leading_nonwordchar(ch)) {
		best_begin = i.raw() - input.data();
	    }
	}

	// Add "..." or equivalent if this doesn't seem to be the start of a
	// sentence.
	if (sentence_boundary != YES) {
	    output += omit;
	}
    }

    if (word.highlight) {
	// Don't include inter-word characters in the highlight.
	Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (Unicode::is_wordchar(ch)) {
		append_escaping_xml(input.data() + best_begin, i.raw(), output);
		best_begin = i.raw() - input.data();
		break;
	    }
	    ++i;
	}
    }

    if (!phrase_len) {
	// Not already inside a phrase highlight - start one if this word
	// begins a highlighted match.
	phrase_len = word.highlight;
	if (phrase_len) output += hi_start;
    }

    const char* p = input.data();
    append_escaping_xml(p + best_begin, p + word.term_end, output);
    best_begin = word.term_end;

    if (phrase_len && --phrase_len == 0) output += hi_end;

    best_pipe.pop_front();
    return true;
}
/** Recursively collect the parts of a query relevant to snippet highlighting.
 *
 *  @param query	  The query (or subquery) to inspect.
 *  @param exact_phrases  Filled with the term-lists of tight phrases found.
 *  @param loose_terms	  Filled with individual terms (mapped to 0, meaning
 *			  "relevance not yet computed" - see check_term()).
 *  @param wildcards	  Filled with wildcard patterns found.
 *  @param longest_phrase Updated with the length of the longest phrase seen.
 */
static void
check_query(const Xapian::Query & query,
	    list<vector<string>> & exact_phrases,
	    unordered_map<string, double> & loose_terms,
	    list<string> & wildcards,
	    size_t & longest_phrase)
{
    // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
    size_t n_subqs = query.get_num_subqueries();
    Xapian::Query::op op = query.get_type();
    if (op == query.LEAF_TERM) {
	const Xapian::Internal::QueryTerm & qt =
	    *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
	loose_terms.insert(make_pair(qt.get_term(), 0));
    } else if (op == query.OP_WILDCARD) {
	const Xapian::Internal::QueryWildcard & qw =
	    *static_cast<const Xapian::Internal::QueryWildcard *>(query.internal.get());
	wildcards.push_back(qw.get_pattern());
    } else if (op == query.OP_PHRASE) {
	const Xapian::Internal::QueryPhrase & phrase =
	    *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
	if (phrase.get_window() == n_subqs) {
	    // Tight phrase.
	    for (size_t i = 0; i != n_subqs; ++i) {
		if (query.get_subquery(i).get_type() != query.LEAF_TERM)
		    goto non_term_subquery;
	    }

	    // Tight phrase of terms.
	    exact_phrases.push_back(vector<string>());
	    vector<string> & terms = exact_phrases.back();
	    terms.reserve(n_subqs);
	    for (size_t i = 0; i != n_subqs; ++i) {
		Xapian::Query q = query.get_subquery(i);
		const Xapian::Internal::QueryTerm & qt =
		    *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
		terms.push_back(qt.get_term());
	    }
	    if (n_subqs > longest_phrase) longest_phrase = n_subqs;
	    return;
	}
    }
non_term_subquery:
    // For any other operator (or a non-tight/non-term phrase), just recurse
    // over the subqueries.  Leaf cases above have n_subqs == 0 so this loop
    // is a no-op for them.
    for (size_t i = 0; i != n_subqs; ++i)
	check_query(query.get_subquery(i), exact_phrases, loose_terms,
		    wildcards, longest_phrase);
}
/** Look up `term` in the loose terms, lazily computing its relevance.
 *
 *  A cached relevance of 0.0 means "not yet computed" (as set up by
 *  check_query()); in that case the termweight is fetched from `stats` and
 *  cached, offset by max_tw so that matches always get a positive weight.
 *
 *  @return a pointer to the cached relevance, or NULL if `term` isn't a
 *	    loose query term (or has no termweight available).
 */
static double*
check_term(unordered_map<string, double> & loose_terms,
	   const Xapian::Weight::Internal * stats,
	   const string & term,
	   double max_tw)
{
    auto it = loose_terms.find(term);
    if (it == loose_terms.end()) return NULL;

    if (it->second == 0.0) {
	double relevance;
	if (!stats->get_termweight(term, relevance)) {
	    // FIXME: Assert?
	    loose_terms.erase(it);
	    return NULL;
	}

	it->second = relevance + max_tw;
    }
    return &it->second;
}
752 string
753 MSet::Internal::snippet(const string & text,
754 size_t length,
755 const Xapian::Stem & stemmer,
756 unsigned flags,
757 const string & hi_start,
758 const string & hi_end,
759 const string & omit) const
761 if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
762 // Too easy!
763 return text;
766 bool cjk_ngram = (flags & MSet::SNIPPET_CJK_NGRAM);
767 if (!cjk_ngram) {
768 cjk_ngram = CJK::is_cjk_enabled();
771 size_t term_start = 0;
772 double min_tw = 0, max_tw = 0;
773 if (stats) stats->get_max_termweight(min_tw, max_tw);
774 if (max_tw == 0.0) {
775 max_tw = 1.0;
776 } else {
777 // Scale up by (1 + 1/64) so that highlighting works better for terms
778 // with termweight 0 (which happens for terms not in the database, and
779 // also with some weighting schemes for terms which occur in almost all
780 // documents.
781 max_tw *= 1.015625;
784 Xapian::Query query;
785 if (enquire.get()) {
786 query = enquire->query;
788 SnipPipe snip(length);
790 list<vector<string>> exact_phrases;
791 unordered_map<string, double> loose_terms;
792 list<string> wildcards;
793 size_t longest_phrase = 0;
794 check_query(query, exact_phrases, loose_terms,
795 wildcards, longest_phrase);
797 vector<double> exact_phrases_relevance;
798 exact_phrases_relevance.reserve(exact_phrases.size());
799 for (auto&& terms : exact_phrases) {
800 // FIXME: What relevance to use?
801 exact_phrases_relevance.push_back(max_tw * terms.size());
804 vector<double> wildcards_relevance;
805 wildcards_relevance.reserve(exact_phrases.size());
806 for (auto&& pattern : wildcards) {
807 // FIXME: What relevance to use?
808 (void)pattern;
809 wildcards_relevance.push_back(max_tw + min_tw);
812 // Background relevance is the same for a given MSet, so cache it
813 // between calls to MSet::snippet() on the same object.
814 unordered_map<string, double>& background = snippet_bg_relevance;
816 vector<string> phrase;
817 if (longest_phrase) phrase.resize(longest_phrase - 1);
818 size_t phrase_next = 0;
819 bool matchfound = false;
820 parse_terms(Utf8Iterator(text), cjk_ngram, true,
821 [&](const string & term, bool positional, const Utf8Iterator & it) {
822 // FIXME: Don't hardcode this here.
823 const size_t max_word_length = 64;
825 if (!positional) return true;
826 if (term.size() > max_word_length) return true;
828 // We get segments with any "inter-word" characters in front of
829 // each word, e.g.:
830 // [The][ cat][ sat][ on][ the][ mat]
831 size_t term_end = text.size() - it.left();
833 double* relevance = 0;
834 size_t highlight = 0;
835 if (stats) {
836 size_t i = 0;
837 for (auto&& terms : exact_phrases) {
838 if (term == terms.back()) {
839 size_t n = terms.size() - 1;
840 bool match = true;
841 while (n--) {
842 if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
843 match = false;
844 break;
847 if (match) {
848 // FIXME: Sort phrases, highest score first!
849 relevance = &exact_phrases_relevance[i];
850 highlight = terms.size();
851 goto relevance_done;
854 ++i;
857 relevance = check_term(loose_terms, stats, term, max_tw);
858 if (relevance) {
859 // Matched unstemmed term.
860 highlight = 1;
861 goto relevance_done;
864 string stem = "Z";
865 stem += stemmer(term);
866 relevance = check_term(loose_terms, stats, stem, max_tw);
867 if (relevance) {
868 // Matched stemmed term.
869 highlight = 1;
870 goto relevance_done;
873 // Check wildcards.
874 // FIXME: Sort wildcards, shortest pattern first or something?
875 i = 0;
876 for (auto&& pattern : wildcards) {
877 if (startswith(term, pattern)) {
878 relevance = &wildcards_relevance[i];
879 highlight = 1;
880 goto relevance_done;
882 ++i;
885 if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
886 // Background document model.
887 auto bgit = background.find(term);
888 if (bgit == background.end()) bgit = background.find(stem);
889 if (bgit == background.end()) {
890 Xapian::doccount tf = enquire->db.get_termfreq(term);
891 if (!tf) {
892 tf = enquire->db.get_termfreq(stem);
893 } else {
894 stem = term;
896 double r = 0.0;
897 if (tf) {
898 // Add one to avoid log(0) when a term indexes all
899 // documents.
900 Xapian::doccount num_docs = stats->collection_size + 1;
901 r = max_tw * log((num_docs - tf) / double(tf));
902 r /= (length + 1) * log(double(num_docs));
903 #if 0
904 if (r <= 0) {
905 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
906 while (i != Utf8Iterator()) {
907 if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
908 r = max_tw * 0.05;
912 #endif
914 bgit = background.emplace(make_pair(stem, r)).first;
916 relevance = &bgit->second;
918 } else {
919 #if 0
920 // In the absence of weight information, assume longer terms
921 // are more relevant, and that unstemmed matches are a bit more
922 // relevant than stemmed matches.
923 if (queryterms.find(term) != queryterms.end()) {
924 relevance = term.size() * 3;
925 } else {
926 string stem = "Z";
927 stem += stemmer(term);
928 if (queryterms.find(stem) != queryterms.end()) {
929 relevance = term.size() * 2;
932 #endif
935 // FIXME: Allow Enquire without a DB set or an empty MSet() to be
936 // used if you don't want the collection model?
938 #if 0
939 // FIXME: Punctuation should somehow be included in the model, but this
940 // approach is problematic - we don't want the first word of a sentence
941 // to be favoured when it's at the end of the window.
943 // Give first word in each sentence a relevance boost.
944 if (term_start == 0) {
945 relevance += 10;
946 } else {
947 for (size_t i = term_start; i + term.size() < term_end; ++i) {
948 if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
949 relevance += 10;
950 break;
954 #endif
956 relevance_done:
957 if (longest_phrase) {
958 phrase[phrase_next] = term;
959 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
962 if (highlight) matchfound = true;
964 if (!snip.pump(relevance, term_end, highlight, flags)) return false;
966 term_start = term_end;
967 return true;
970 snip.done();
972 // Put together the snippet.
973 string result;
974 if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
975 while (snip.drain(text, hi_start, hi_end, omit, result)) { }
978 return result;