[ci] Fix clang-santisers job for GHA change
[xapian.git] / xapian-core / queryparser / termgenerator_internal.cc
blob14388a072513f8d6fbedd88a9a1c1cab5c3abaad
/** @file
 * @brief TermGenerator class internals
 */
/* Copyright (C) 2007-2024 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #include <config.h>
23 #include "termgenerator_internal.h"
25 #include "api/msetinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
35 #include <algorithm>
36 #include <cmath>
37 #include <deque>
38 #include <limits>
39 #include <list>
40 #include <string>
41 #include <string_view>
42 #include <unordered_map>
43 #include <vector>
45 #include "word-breaker.h"
47 using namespace std;
49 namespace Xapian {
51 static inline bool
52 U_isupper(unsigned ch)
54 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
57 static inline unsigned
58 check_wordchar(unsigned ch)
60 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
61 return 0;
64 static inline bool
65 should_stem(const std::string & term)
67 const unsigned int SHOULD_STEM_MASK =
68 (1 << Unicode::LOWERCASE_LETTER) |
69 (1 << Unicode::TITLECASE_LETTER) |
70 (1 << Unicode::MODIFIER_LETTER) |
71 (1 << Unicode::OTHER_LETTER);
72 Utf8Iterator u(term);
73 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
static const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
81 static inline unsigned
82 check_infix(unsigned ch)
84 if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
85 // Unicode includes all these except '&' in its word boundary rules,
86 // as well as 0x2019 (which we handle below) and ':' (for Swedish
87 // apparently, but we ignore this for now as it's problematic in
88 // real world cases).
89 return ch;
91 // 0x2019 is Unicode apostrophe and single closing quote.
92 // 0x201b is Unicode single opening quote with the tail rising.
93 if (ch == 0x2019 || ch == 0x201b) return '\'';
94 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
95 return UNICODE_IGNORE;
96 return 0;
99 static inline unsigned
100 check_infix_digit(unsigned ch)
102 // This list of characters comes from Unicode's word identifying algorithm.
103 switch (ch) {
104 case ',':
105 case '.':
106 case ';':
107 case 0x037e: // GREEK QUESTION MARK
108 case 0x0589: // ARMENIAN FULL STOP
109 case 0x060D: // ARABIC DATE SEPARATOR
110 case 0x07F8: // NKO COMMA
111 case 0x2044: // FRACTION SLASH
112 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
113 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
114 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
115 return ch;
117 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
118 return UNICODE_IGNORE;
119 return 0;
122 static inline bool
123 is_digit(unsigned ch) {
124 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
/** Check for characters worth keeping at the end of a term (e.g. C++, C#).
 *
 *  Returns the character to append, or 0 if @a ch shouldn't be kept.
 */
static inline unsigned
check_suffix(unsigned ch)
{
    switch (ch) {
	case '+':
	case '#':
	    return ch;
	default:
	    // FIXME: what about '-'?
	    return 0;
    }
}
// The snippet code below relies on these two flags having the same value so
// it can pass TermGenerator and MSet flag bits around interchangeably.
static_assert(int(MSet::SNIPPET_WORD_BREAKS) == TermGenerator::FLAG_WORD_BREAKS,
	      "WORD_BREAKS flags have same value");
/** Tokenise a span of text in an "unbroken" script (one without spaces
 *  between words).
 *
 *  With ICU and SNIPPET_WORD_BREAKS, real word-breaking is used; otherwise
 *  the span is split into ngrams.  @a itor is advanced past the span.
 *
 *  Returns false if @a action asked us to stop.
 */
template<typename ACTION>
static bool
break_words(Utf8Iterator& itor, unsigned break_flags, bool with_positions,
	    ACTION action)
{
#ifdef USE_ICU
    if (break_flags & MSet::SNIPPET_WORD_BREAKS) {
	const char* start = itor.raw();
	// get_unbroken() returns the number of codepoints, which we aren't
	// interested in here.
	(void)get_unbroken(itor);
	size_t left = itor.raw() - start;
	for (WordIterator tk(start, left); tk != WordIterator(); ++tk) {
	    const string& token = *tk;
	    left -= token.length();
	    // Pass the number of bytes left after this token so the caller
	    // can work out the token's end offset in the input.
	    if (!action(token, with_positions, itor.left() + left))
		return false;
	}
	return true;
    }
#else
    (void)break_flags;
#endif
    NgramIterator tk(itor);
    while (tk != NgramIterator()) {
	const string& token = *tk;
	// FLAG_NGRAMS only sets positions for tokens of length 1.
	bool with_pos = with_positions && tk.unigram();
	if (!action(token, with_pos, tk.get_utf8iterator().left()))
	    return false;
	++tk;
    }
    // Update itor to point the end of the span of text in an unbroken script.
    itor = tk.get_utf8iterator();
    return true;
}
/** Templated framework for processing terms.
 *
 *  Calls action(term, positional) for each term to add, where term is a
 *  std::string holding the term, and positional is a bool indicating
 *  if this term carries positional information.
 */
template<typename ACTION>
static void
parse_terms(Utf8Iterator itor, unsigned break_flags, bool with_positions,
	    ACTION action)
{
    while (true) {
	// Advance to the start of the next term.
	unsigned ch;
	while (true) {
	    if (itor == Utf8Iterator()) return;
	    ch = check_wordchar(*itor);
	    if (ch) break;
	    ++itor;
	}

	string term;
	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
	// Don't worry if there's a trailing '.' or not.
	if (U_isupper(*itor)) {
	    const Utf8Iterator end;
	    Utf8Iterator p = itor;
	    do {
		Unicode::append_utf8(term, Unicode::tolower(*p++));
	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
	    // One letter does not make an acronym!  If we handled a single
	    // uppercase letter here, we wouldn't catch M&S below.
	    if (term.size() > 1) {
		// Check there's not a (lower case) letter or digit
		// immediately after it.
		if (p == end || !Unicode::is_wordchar(*p)) {
		    itor = p;
		    goto endofterm;
		}
	    }
	    // Not an acronym after all - start the term from scratch.
	    term.resize(0);
	}

	while (true) {
	    if (break_flags && is_unbroken_wordchar(*itor)) {
		// Hand off a span of "unbroken script" text (e.g. CJK,
		// presumably - determined by is_unbroken_wordchar()).
		if (!break_words(itor, break_flags, with_positions, action))
		    return;
		// Skip to the next word character.
		while (true) {
		    if (itor == Utf8Iterator()) return;
		    ch = check_wordchar(*itor);
		    if (ch) break;
		    ++itor;
		}
		continue;
	    }
	    // Accumulate a run of word characters into term.
	    unsigned prevch;
	    do {
		Unicode::append_utf8(term, ch);
		prevch = ch;
		if (++itor == Utf8Iterator() ||
		    (break_flags && is_unbroken_script(*itor)))
		    goto endofterm;
		ch = check_wordchar(*itor);
	    } while (ch);

	    // Only allow an infix character if there's a word character
	    // on both sides of it.
	    Utf8Iterator next(itor);
	    ++next;
	    if (next == Utf8Iterator()) break;
	    unsigned nextch = check_wordchar(*next);
	    if (!nextch) break;
	    unsigned infix_ch = *itor;
	    if (is_digit(prevch) && is_digit(*next)) {
		infix_ch = check_infix_digit(infix_ch);
	    } else {
		// Handle things like '&' in AT&T, apostrophes, etc.
		infix_ch = check_infix(infix_ch);
	    }
	    if (!infix_ch) break;
	    if (infix_ch != UNICODE_IGNORE)
		Unicode::append_utf8(term, infix_ch);
	    ch = nextch;
	    itor = next;
	}

	{
	    // Handle term suffixes like the '+'s in "C++" - but only allow
	    // up to 3 suffix characters.
	    size_t len = term.size();
	    unsigned count = 0;
	    while ((ch = check_suffix(*itor))) {
		if (++count > 3) {
		    term.resize(len);
		    break;
		}
		Unicode::append_utf8(term, ch);
		if (++itor == Utf8Iterator()) goto endofterm;
	    }
	    // Don't index fish+chips as fish+ chips.
	    if (Unicode::is_wordchar(*itor))
		term.resize(len);
	}

endofterm:
	if (!action(term, with_positions, itor.left()))
	    return;
    }
}
/** Tokenise text and add the resulting terms to the document.
 *
 *  @param itor		  Iterator over the UTF-8 text to index.
 *  @param wdf_inc	  The wdf increment to use for each generated term.
 *  @param prefix	  Term prefix to prepend to each generated term.
 *  @param with_positions Whether to add positional information.
 */
void
TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
				    string_view prefix, bool with_positions)
{
#ifndef USE_ICU
    if (flags & FLAG_WORD_BREAKS) {
	throw Xapian::FeatureUnavailableError("FLAG_WORD_BREAKS requires "
					      "building Xapian to use ICU");
    }
#endif
    unsigned break_flags = flags & (FLAG_NGRAMS | FLAG_WORD_BREAKS);
    if (break_flags == 0 && is_ngram_enabled()) {
	break_flags = FLAG_NGRAMS;
    }

    // With no stopper set, act as if stopping is disabled.
    stop_strategy current_stop_mode;
    if (!stopper) {
	current_stop_mode = TermGenerator::STOP_NONE;
    } else {
	current_stop_mode = stop_mode;
    }

    // Create two std::string objects which we effectively use as buffers to
    // build the prefixed terms we generate (one for non-stemmed and one for
    // stemmed terms).  This is a simple way to avoid creating a temporary
    // std::string object each time we generate a prefixed term.
    string prefixed_term;
    auto prefix_size = prefix.size();
    prefixed_term.reserve(prefix_size + max_word_length);
    prefixed_term.assign(prefix);

    // Stemmed terms get a 'Z' prefix, except with STEM_ALL.
    string prefixed_stemmed_term;
    int add_z = (strategy != TermGenerator::STEM_ALL);
    prefixed_stemmed_term.reserve(add_z + prefix_size + max_word_length);
    if (add_z) prefixed_stemmed_term.assign(1, 'Z');
    prefixed_stemmed_term.append(prefix);
    auto prefixed_stemmed_size = prefixed_stemmed_term.size();

    parse_terms(itor, break_flags, with_positions,
		[=, &prefixed_term, &prefixed_stemmed_term
#if __cplusplus >= 201907L
// C++20 no longer supports implicit `this` in lambdas but older C++ versions
// don't allow `this` here.
		 , this
#endif
		](const string & term, bool positional, size_t) {
	if (term.size() > max_word_length) return true;

	if (current_stop_mode == TermGenerator::STOP_ALL &&
	    (*stopper)(term)) {
	    return true;
	}

	// Add the unstemmed form (unless we're stemming everything).
	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_NONE ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    prefixed_term.append(term);
	    if (positional) {
		if (rare(cur_pos >= pos_limit))
		    throw Xapian::RangeError("termpos limit exceeded");
		doc.add_posting(prefixed_term, ++cur_pos, wdf_inc);
	    } else {
		doc.add_term(prefixed_term, wdf_inc);
	    }
	    // Truncate back to just the prefix ready for the next term.
	    prefixed_term.resize(prefix_size);
	}

	// MSVC seems to need "this->" on member variables in this
	// situation.
	if ((this->flags & FLAG_SPELLING) && prefix_size == 0)
	    db.add_spelling(term);

	if (strategy == TermGenerator::STEM_NONE || stemmer.is_none())
	    return true;

	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    if (current_stop_mode == TermGenerator::STOP_STEMMED &&
		(*stopper)(term))
		return true;

	    // Note, this uses the lowercased term, but that's OK as we
	    // only want to avoid stemming terms starting with a digit.
	    if (!should_stem(term)) return true;
	}

	// Add stemmed form without positional information.
	const string& stem = stemmer(term);
	if (rare(stem.empty())) return true;
	prefixed_stemmed_term.append(stem);
	if (strategy != TermGenerator::STEM_SOME && positional) {
	    if (strategy != TermGenerator::STEM_SOME_FULL_POS) {
		if (rare(cur_pos >= pos_limit))
		    throw Xapian::RangeError("termpos limit exceeded");
		++cur_pos;
	    }
	    doc.add_posting(prefixed_stemmed_term, cur_pos, wdf_inc);
	} else {
	    doc.add_term(prefixed_stemmed_term, wdf_inc);
	}
	// Truncate back to just the (Z +) prefix ready for the next term.
	prefixed_stemmed_term.resize(prefixed_stemmed_size);
	return true;
    });
}
/// One word (plus any attached punctuation) in the snippet pipeline.
struct Sniplet {
    // Pointer to the relevance weight for this word (NULL if it carries no
    // relevance of its own - see SnipPipe::pump()).
    double* relevance;

    // Offset of the end of this word in the input text.
    size_t term_end;

    // Highlight span: number of words to highlight starting here (0 means
    // no highlighting for this word).
    size_t highlight;

    Sniplet(double* r, size_t t, size_t h)
	: relevance(r), term_end(t), highlight(h) { }
};
/** Sliding-window pipeline used to pick the best snippet from some text.
 *
 *  Words are pump()ed in one at a time; the pipe tracks the rolling
 *  relevance sum of a window of at most `length` bytes and remembers the
 *  best-scoring window seen, which drain() then renders to the output.
 */
class SnipPipe {
    // Words currently in the candidate window.
    deque<Sniplet> pipe;

    // Words belonging to the best window seen so far.
    deque<Sniplet> best_pipe;

    // Requested length for snippet.
    size_t length;

    // Position in text of start of current pipe contents.
    size_t begin = 0;

    // Rolling sum of the current pipe contents.
    double sum = 0;

    // Remaining words in the phrase currently being highlighted by drain().
    size_t phrase_len = 0;

  public:
    // Offset of the start of the best window found.
    size_t best_begin = 0;

    // Offset of the end of the best window found.
    size_t best_end = 0;

    // Relevance sum of the best window found.
    double best_sum = 0;

    // Add one to length to allow for inter-word space.
    // FIXME: We ought to correctly allow for multiple spaces.
    explicit SnipPipe(size_t length_) : length(length_ + 1) { }

    bool pump(double* r, size_t t, size_t h, unsigned flags);

    void done();

    bool drain(string_view input,
	       string_view hi_start,
	       string_view hi_end,
	       string_view omit,
	       string& output);
};
436 #define DECAY 2.0
/** Feed one word into the snippet window.
 *
 *  @param r	 Pointer to the word's relevance (may be NULL).
 *  @param t	 Offset of the end of the word in the input text.
 *  @param h	 Highlight length in words (h > 1 marks the final word of a
 *		 phrase of h words).
 *  @param flags Snippet flags (SNIPPET_EXHAUSTIVE affects early exit).
 *
 *  Returns false if we can stop scanning the input early.
 */
inline bool
SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
{
    if (h > 1) {
	if (pipe.size() >= h - 1) {
	    // The final term of a phrase is entering the window.  Peg the
	    // phrase's relevance onto the first term of the phrase, so it'll
	    // be removed from `sum` when the phrase starts to leave the
	    // window.
	    auto & phrase_start = pipe[pipe.size() - (h - 1)];
	    if (phrase_start.relevance) {
		*phrase_start.relevance *= DECAY;
		sum -= *phrase_start.relevance;
	    }
	    sum += *r;
	    phrase_start.relevance = r;
	    phrase_start.highlight = h;
	    *r /= DECAY;
	}
	// The phrase's weight is carried by its first word, so this final
	// word enters the pipe with no relevance or highlight of its own.
	r = NULL;
	h = 0;
    }
    pipe.emplace_back(r, t, h);
    if (r) {
	sum += *r;
	*r /= DECAY;
    }

    // If necessary, discard words from the start of the pipe until it has the
    // desired length.
    // FIXME: Also shrink the window past words with relevance < 0?
    while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
	const Sniplet& word = pipe.front();
	if (word.relevance) {
	    *word.relevance *= DECAY;
	    sum -= *word.relevance;
	}
	begin = word.term_end;
	if (best_end >= begin)
	    best_pipe.push_back(word);
	pipe.pop_front();
	// E.g. can happen if the current term is longer than the requested
	// length!
	if (rare(pipe.empty())) break;
    }

    // Using > here doesn't work well, as we don't extend a snippet over terms
    // with 0 weight.
    if (sum >= best_sum) {
	// Discard any part of `best_pipe` which is before `begin`.
	if (begin >= best_end) {
	    best_pipe.clear();
	} else {
	    while (!best_pipe.empty() &&
		   best_pipe.front().term_end <= begin) {
		best_pipe.pop_front();
	    }
	}
	best_sum = sum;
	best_begin = begin;
	best_end = t;
    } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
	if (best_sum > 0 && best_end < begin) {
	    // We found something, and we aren't still looking near it.
	    // FIXME: Benchmark this and adjust if necessary.
	    return false;
	}
    }
    return true;
}
/** Finish feeding words in, trimming the pipe down to the best window. */
inline void
SnipPipe::done()
{
    // Discard any part of `pipe` which is after `best_end`.
    if (begin >= best_end) {
	pipe.clear();
    } else {
	// We should never empty the pipe (as that case should be handled
	// above).
	while (rare(!pipe.empty()) &&
	       pipe.back().term_end > best_end) {
	    pipe.pop_back();
	}
    }
}
525 // Check if a non-word character is should be included at the start of the
526 // snippet. We want to include certain leading non-word characters, but not
527 // others.
528 static inline bool
529 snippet_check_leading_nonwordchar(unsigned ch) {
530 if (Unicode::is_currency(ch) ||
531 Unicode::get_category(ch) == Unicode::OPEN_PUNCTUATION ||
532 Unicode::get_category(ch) == Unicode::INITIAL_QUOTE_PUNCTUATION) {
533 return true;
535 switch (ch) {
536 case '"':
537 case '#':
538 case '%':
539 case '&':
540 case '\'':
541 case '+':
542 case '-':
543 case '/':
544 case '<':
545 case '@':
546 case '\\':
547 case '`':
548 case '~':
549 case 0x00A1: // INVERTED EXCLAMATION MARK
550 case 0x00A7: // SECTION SIGN
551 case 0x00BF: // INVERTED QUESTION MARK
552 return true;
554 return false;
557 // Check if a non-word character is should be included at the end of the
558 // snippet. We want to include certain trailing non-word characters, but not
559 // others.
560 static inline bool
561 snippet_check_trailing_nonwordchar(unsigned ch) {
562 if (Unicode::is_currency(ch) ||
563 Unicode::get_category(ch) == Unicode::CLOSE_PUNCTUATION ||
564 Unicode::get_category(ch) == Unicode::FINAL_QUOTE_PUNCTUATION) {
565 return true;
567 switch (ch) {
568 case '"':
569 case '%':
570 case '\'':
571 case '+':
572 case '-':
573 case '/':
574 case '>':
575 case '@':
576 case '\\':
577 case '`':
578 case '~':
579 return true;
581 return false;
/** Append the bytes [p, end) to output, escaping XML-special characters.
 *
 *  '&', '<' and '>' are replaced by their XML entities; everything else is
 *  copied through verbatim.
 */
static inline void
append_escaping_xml(const char* p, const char* end, std::string& output)
{
    while (p != end) {
	char ch = *p++;
	if (ch == '&') {
	    output += "&amp;";
	} else if (ch == '<') {
	    output += "&lt;";
	} else if (ch == '>') {
	    output += "&gt;";
	} else {
	    output += ch;
	}
    }
}
/** Emit the next word of the chosen snippet into @a output.
 *
 *  Call repeatedly until it returns false.  Handles adding the `omit`
 *  marker at a truncated start/end, highlight start/end markers, and
 *  XML-escaping of the copied text.
 */
inline bool
SnipPipe::drain(string_view input,
		string_view hi_start,
		string_view hi_end,
		string_view omit,
		string& output)
{
    if (best_pipe.empty() && !pipe.empty()) {
	swap(best_pipe, pipe);
    }

    if (best_pipe.empty()) {
	// All words emitted - decide how to finish off the snippet.
	size_t tail_len = input.size() - best_end;
	if (tail_len == 0) return false;

	// See if this is the end of a sentence.
	// FIXME: This is quite simplistic - look at the Unicode rules:
	// https://unicode.org/reports/tr29/#Sentence_Boundaries
	bool sentence_end = false;
	Utf8Iterator i(input.data() + best_end, tail_len);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (sentence_end && Unicode::is_whitespace(ch)) break;

	    // Allow "...", "!!", "!?!", etc...
	    sentence_end = (ch == '.' || ch == '?' || ch == '!');

	    if (Unicode::is_wordchar(ch)) break;
	    ++i;
	}

	if (sentence_end) {
	    // Include end of sentence punctuation.
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    return false;
	}

	// Include trailing punctuation which includes meaning or context.
	i.assign(input.data() + best_end, tail_len);
	int trailing_punc = 0;
	while (i != Utf8Iterator() && snippet_check_trailing_nonwordchar(*i)) {
	    // But limit how much trailing punctuation we include.
	    if (++trailing_punc > 4) {
		trailing_punc = 0;
		break;
	    }
	    ++i;
	}
	if (trailing_punc) {
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    if (i == Utf8Iterator()) return false;
	}

	// Append "..." or equivalent as this doesn't seem to be the start
	// of a sentence.
	output += omit;

	return false;
    }

    const Sniplet & word = best_pipe.front();

    if (output.empty()) {
	// Start of the snippet.
	enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;

	Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    switch (sentence_boundary) {
		case NO:
		    if (ch == '.' || ch == '?' || ch == '!') {
			sentence_boundary = PUNC;
		    }
		    break;
		case PUNC:
		    if (Unicode::is_whitespace(ch)) {
			sentence_boundary = YES;
		    } else if (ch == '.' || ch == '?' || ch == '!') {
			// Allow "...", "!!", "!?!", etc...
		    } else {
			sentence_boundary = NO;
		    }
		    break;
		case YES:
		    break;
	    }

	    // Start the snippet at the start of the first word, but include
	    // certain punctuation too.
	    if (Unicode::is_wordchar(ch)) {
		// But limit how much leading punctuation we include.
		size_t word_begin = i.raw() - input.data();
		if (word_begin - best_begin > 4) {
		    best_begin = word_begin;
		}
		break;
	    }
	    ++i;
	    if (!snippet_check_leading_nonwordchar(ch)) {
		best_begin = i.raw() - input.data();
	    }
	}

	// Add "..." or equivalent if this doesn't seem to be the start of a
	// sentence.
	if (sentence_boundary != YES) {
	    output += omit;
	}
    }

    if (word.highlight) {
	// Don't include inter-word characters in the highlight.
	Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (Unicode::is_wordchar(ch)) {
		append_escaping_xml(input.data() + best_begin, i.raw(), output);
		best_begin = i.raw() - input.data();
		break;
	    }
	    ++i;
	}
    }

    // Start a new highlight if we're not already inside one.
    if (!phrase_len) {
	phrase_len = word.highlight;
	if (phrase_len) output += hi_start;
    }

    const char* p = input.data();
    append_escaping_xml(p + best_begin, p + word.term_end, output);
    best_begin = word.term_end;

    // Close the highlight after the last word of the phrase.
    if (phrase_len && --phrase_len == 0) output += hi_end;

    best_pipe.pop_front();
    return true;
}
/** Recursively examine @a query, collecting what the snippet code needs.
 *
 *  Leaf terms go into @a loose_terms; wildcard and edit-distance subqueries
 *  into @a wildcards and @a fuzzies; tight phrases of terms into
 *  @a exact_phrases (with @a longest_phrase tracking the longest one).
 */
static void
check_query(const Xapian::Query & query,
	    list<vector<string>> & exact_phrases,
	    unordered_map<string, double> & loose_terms,
	    list<const Xapian::Internal::QueryWildcard*> & wildcards,
	    list<const Xapian::Internal::QueryEditDistance*> & fuzzies,
	    size_t & longest_phrase)
{
    // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
    size_t n_subqs = query.get_num_subqueries();
    Xapian::Query::op op = query.get_type();
    if (op == query.LEAF_TERM) {
	const Xapian::Internal::QueryTerm & qt =
	    *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
	loose_terms.insert(make_pair(qt.get_term(), 0));
    } else if (op == query.OP_WILDCARD) {
	using Xapian::Internal::QueryWildcard;
	const QueryWildcard* qw =
	    static_cast<const QueryWildcard*>(query.internal.get());
	wildcards.push_back(qw);
    } else if (op == query.OP_EDIT_DISTANCE) {
	using Xapian::Internal::QueryEditDistance;
	const QueryEditDistance* qed =
	    static_cast<const QueryEditDistance*>(query.internal.get());
	fuzzies.push_back(qed);
    } else if (op == query.OP_PHRASE) {
	const Xapian::Internal::QueryPhrase & phrase =
	    *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
	if (phrase.get_window() == n_subqs) {
	    // Tight phrase.
	    for (size_t i = 0; i != n_subqs; ++i) {
		if (query.get_subquery(i).get_type() != query.LEAF_TERM)
		    goto non_term_subquery;
	    }

	    // Tight phrase of terms.
	    exact_phrases.push_back(vector<string>());
	    vector<string> & terms = exact_phrases.back();
	    terms.reserve(n_subqs);
	    for (size_t i = 0; i != n_subqs; ++i) {
		Xapian::Query q = query.get_subquery(i);
		const Xapian::Internal::QueryTerm & qt =
		    *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
		terms.push_back(qt.get_term());
	    }
	    if (n_subqs > longest_phrase) longest_phrase = n_subqs;
	    return;
	}
    }

non_term_subquery:
    // Not a leaf we handle specially - recurse into the subqueries.
    for (size_t i = 0; i != n_subqs; ++i)
	check_query(query.get_subquery(i), exact_phrases, loose_terms,
		    wildcards, fuzzies, longest_phrase);
}
800 static double*
801 check_term(unordered_map<string, double> & loose_terms,
802 const Xapian::Weight::Internal * stats,
803 const string & term,
804 double max_tw)
806 auto it = loose_terms.find(term);
807 if (it == loose_terms.end()) return NULL;
809 if (it->second == 0.0) {
810 double relevance;
811 if (!stats->get_termweight(term, relevance)) {
812 // FIXME: Assert?
813 loose_terms.erase(it);
814 return NULL;
817 it->second = relevance + max_tw;
819 return &it->second;
/** Generate a snippet from @a text, highlighting terms which match this
 *  MSet's query.
 *
 *  @param length	Target snippet length in bytes.
 *  @param stemmer	Stemmer used to match stemmed query terms.
 *  @param flags	SNIPPET_* flags.
 *  @param hi_start	String to insert before a highlighted match.
 *  @param hi_end	String to insert after a highlighted match.
 *  @param omit		String to insert where text has been elided.
 */
string
MSet::Internal::snippet(string_view text,
			size_t length,
			const Xapian::Stem & stemmer,
			unsigned flags,
			string_view hi_start,
			string_view hi_end,
			string_view omit) const
{
    if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
	// Too easy!
	return string{text};
    }

#ifndef USE_ICU
    if (flags & MSet::SNIPPET_WORD_BREAKS) {
	throw Xapian::FeatureUnavailableError("SNIPPET_WORD_BREAKS requires "
					      "building Xapian to use ICU");
    }
#endif
    auto SNIPPET_BREAK_MASK = MSet::SNIPPET_NGRAMS | MSet::SNIPPET_WORD_BREAKS;
    unsigned break_flags = flags & SNIPPET_BREAK_MASK;
    if (break_flags == 0 && is_ngram_enabled()) {
	break_flags = MSet::SNIPPET_NGRAMS;
    }

    size_t term_start = 0;
    double min_tw = 0, max_tw = 0;
    if (stats) stats->get_max_termweight(min_tw, max_tw);
    if (max_tw == 0.0) {
	max_tw = 1.0;
    } else {
	// Scale up by (1 + 1/64) so that highlighting works better for terms
	// with termweight 0 (which happens for terms not in the database, and
	// also with some weighting schemes for terms which occur in almost all
	// documents.
	max_tw *= 1.015625;
    }

    Xapian::Query query;
    if (enquire) {
	query = enquire->query;
    }
    SnipPipe snip(length);

    // Collect the query parts we can match against the text.
    list<vector<string>> exact_phrases;
    unordered_map<string, double> loose_terms;
    list<const Xapian::Internal::QueryWildcard*> wildcards;
    list<const Xapian::Internal::QueryEditDistance*> fuzzies;
    size_t longest_phrase = 0;
    check_query(query, exact_phrases, loose_terms,
		wildcards, fuzzies, longest_phrase);

    vector<double> exact_phrases_relevance;
    exact_phrases_relevance.reserve(exact_phrases.size());
    for (auto&& terms : exact_phrases) {
	// FIXME: What relevance to use?
	exact_phrases_relevance.push_back(max_tw * terms.size());
    }

    vector<double> wildcards_relevance;
    wildcards_relevance.reserve(wildcards.size());
    for (auto&& pattern : wildcards) {
	// FIXME: What relevance to use?
	(void)pattern;
	wildcards_relevance.push_back(max_tw + min_tw);
    }

    vector<double> fuzzies_relevance;
    fuzzies_relevance.reserve(fuzzies.size());
    for (auto&& pattern : fuzzies) {
	// FIXME: What relevance to use?
	(void)pattern;
	fuzzies_relevance.push_back(max_tw + min_tw);
    }

    // Background relevance is the same for a given MSet, so cache it
    // between calls to MSet::snippet() on the same object.
    unordered_map<string, double>& background = snippet_bg_relevance;

    // Ring buffer of the last (longest_phrase - 1) terms, used to detect
    // exact phrase matches ending at the current term.
    vector<string> phrase;
    if (longest_phrase) phrase.resize(longest_phrase - 1);
    size_t phrase_next = 0;
    bool matchfound = false;
    parse_terms(Utf8Iterator(text), break_flags, true,
		[&](const string & term, bool positional, size_t left) {
	// FIXME: Don't hardcode this here.
	const size_t max_word_length = 64;

	if (!positional) return true;
	if (term.size() > max_word_length) return true;

	// We get segments with any "inter-word" characters in front of
	// each word, e.g.:
	// [The][ cat][ sat][ on][ the][ mat]
	size_t term_end = text.size() - left;

	double* relevance = 0;
	size_t highlight = 0;
	if (stats) {
	    // Check for an exact phrase ending with this term.
	    size_t i = 0;
	    for (auto&& terms : exact_phrases) {
		if (term == terms.back()) {
		    size_t n = terms.size() - 1;
		    bool match = true;
		    while (UNSIGNED_OVERFLOW_OK(n--)) {
			if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
			    match = false;
			    break;
			}
		    }
		    if (match) {
			// FIXME: Sort phrases, highest score first!
			relevance = &exact_phrases_relevance[i];
			highlight = terms.size();
			goto relevance_done;
		    }
		}
		++i;
	    }

	    relevance = check_term(loose_terms, stats.get(), term, max_tw);
	    if (relevance) {
		// Matched unstemmed term.
		highlight = 1;
		goto relevance_done;
	    }

	    string stem = "Z";
	    stem += stemmer(term);
	    relevance = check_term(loose_terms, stats.get(), stem, max_tw);
	    if (relevance) {
		// Matched stemmed term.
		highlight = 1;
		goto relevance_done;
	    }

	    // Check wildcards.
	    // FIXME: Sort wildcards, cheapest to check first or something?
	    i = 0;
	    for (auto&& qw : wildcards) {
		if (qw->test(term)) {
		    relevance = &wildcards_relevance[i];
		    highlight = 1;
		    goto relevance_done;
		}
		++i;
	    }

	    // Check fuzzies.
	    // FIXME: Sort fuzzies, cheapest to check first or something?
	    i = 0;
	    for (auto&& qed : fuzzies) {
		// test() returns 0 for no match, or edit_distance + 1.
		int ed_result = qed->test(term);
		if (ed_result) {
		    // FIXME: Reduce relevance the more edits there are?
		    // We can't just divide by ed_result here as this
		    // relevance is used by any term matching this
		    // subquery.
		    relevance = &fuzzies_relevance[i];
		    highlight = 1;
		    goto relevance_done;
		}
		++i;
	    }

	    if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
		// Background document model.
		auto bgit = background.find(term);
		if (bgit == background.end()) bgit = background.find(stem);
		if (bgit == background.end()) {
		    Xapian::doccount tf = enquire->db.get_termfreq(term);
		    if (!tf) {
			tf = enquire->db.get_termfreq(stem);
		    } else {
			stem = term;
		    }
		    double r = 0.0;
		    if (tf) {
			// Add one to avoid log(0) when a term indexes all
			// documents.
			Xapian::doccount num_docs = stats->collection_size + 1;
			r = max_tw * log((num_docs - tf) / double(tf));
			r /= (length + 1) * log(double(num_docs));
#if 0
			if (r <= 0) {
			    Utf8Iterator i(text.data() + term_start, text.data() + term_end);
			    while (i != Utf8Iterator()) {
				if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
				    r = max_tw * 0.05;
				}
			    }
			}
#endif
		    }
		    bgit = background.emplace(make_pair(stem, r)).first;
		}
		relevance = &bgit->second;
	    }
	} else {
#if 0
	    // In the absence of weight information, assume longer terms
	    // are more relevant, and that unstemmed matches are a bit more
	    // relevant than stemmed matches.
	    if (queryterms.find(term) != queryterms.end()) {
		relevance = term.size() * 3;
	    } else {
		string stem = "Z";
		stem += stemmer(term);
		if (queryterms.find(stem) != queryterms.end()) {
		    relevance = term.size() * 2;
		}
	    }
#endif
	}

	// FIXME: Allow Enquire without a DB set or an empty MSet() to be
	// used if you don't want the collection model?

#if 0
	// FIXME: Punctuation should somehow be included in the model, but this
	// approach is problematic - we don't want the first word of a sentence
	// to be favoured when it's at the end of the window.

	// Give first word in each sentence a relevance boost.
	if (term_start == 0) {
	    relevance += 10;
	} else {
	    for (size_t i = term_start; i + term.size() < term_end; ++i) {
		if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
		    relevance += 10;
		    break;
		}
	    }
	}
#endif

relevance_done:
	if (longest_phrase) {
	    // Record this term in the phrase ring buffer.
	    phrase[phrase_next] = term;
	    phrase_next = (phrase_next + 1) % (longest_phrase - 1);
	}

	if (highlight) matchfound = true;

	if (!snip.pump(relevance, term_end, highlight, flags)) return false;

	term_start = term_end;
	return true;
    });

    snip.done();

    // Put together the snippet.
    string result;
    if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
	while (snip.drain(text, hi_start, hi_end, omit, result)) { }
    }

    return result;
}