[ci] Fix clang-santisers job for GHA change
[xapian.git] / xapian-core / queryparser / termgenerator_internal.cc
blob14388a072513f8d6fbedd88a9a1c1cab5c3abaad
/** @file
 * @brief TermGenerator class internals
 */
/* Copyright (C) 2007-2024 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #include <config.h>
23 #include "termgenerator_internal.h"
25 #include "api/msetinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
35 #include <algorithm>
36 #include <cmath>
37 #include <deque>
38 #include <limits>
39 #include <list>
40 #include <string>
41 #include <string_view>
42 #include <unordered_map>
43 #include <vector>
45 #include "word-breaker.h"
47 using namespace std;
49 namespace Xapian {
51 static inline bool
52 U_isupper(unsigned ch)
54 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
57 static inline unsigned
58 check_wordchar(unsigned ch)
60 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
61 return 0;
64 static inline bool
65 should_stem(const std::string & term)
67 const unsigned int SHOULD_STEM_MASK =
68 (1 << Unicode::LOWERCASE_LETTER) |
69 (1 << Unicode::TITLECASE_LETTER) |
70 (1 << Unicode::MODIFIER_LETTER) |
71 (1 << Unicode::OTHER_LETTER);
72 Utf8Iterator u(term);
73 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
static const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
81 static inline unsigned
82 check_infix(unsigned ch)
84 if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
85 // Unicode includes all these except '&' in its word boundary rules,
86 // as well as 0x2019 (which we handle below) and ':' (for Swedish
87 // apparently, but we ignore this for now as it's problematic in
88 // real world cases).
89 return ch;
91 // 0x2019 is Unicode apostrophe and single closing quote.
92 // 0x201b is Unicode single opening quote with the tail rising.
93 if (ch == 0x2019 || ch == 0x201b) return '\'';
94 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
95 return UNICODE_IGNORE;
96 return 0;
99 static inline unsigned
100 check_infix_digit(unsigned ch)
102 // This list of characters comes from Unicode's word identifying algorithm.
103 switch (ch) {
104 case ',':
105 case '.':
106 case ';':
107 case 0x037e: // GREEK QUESTION MARK
108 case 0x0589: // ARMENIAN FULL STOP
109 case 0x060D: // ARABIC DATE SEPARATOR
110 case 0x07F8: // NKO COMMA
111 case 0x2044: // FRACTION SLASH
112 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
113 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
114 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
115 return ch;
117 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
118 return UNICODE_IGNORE;
119 return 0;
122 static inline bool
123 is_digit(unsigned ch) {
124 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
/** Check for characters worth keeping at the end of a term (e.g. C++, C#).
 *
 *  Returns the character to append, or 0 if @a ch shouldn't be kept.
 */
static inline unsigned
check_suffix(unsigned ch)
{
    switch (ch) {
	case '+':
	case '#':
	    return ch;
	default:
	    // FIXME: what about '-'?
	    return 0;
    }
}
// The snippet code below relies on these two flags having the same value so
// it can pass TermGenerator and MSet flag bits around interchangeably.
static_assert(int(MSet::SNIPPET_WORD_BREAKS) == TermGenerator::FLAG_WORD_BREAKS,
	      "WORD_BREAKS flags have same value");
/** Tokenise a span of text in an "unbroken" script (one without spaces
 *  between words).
 *
 *  With ICU and SNIPPET_WORD_BREAKS, real word-breaking is used; otherwise
 *  the span is split into ngrams.  @a itor is advanced past the span.
 *
 *  Returns false if @a action asked us to stop.
 */
template<typename ACTION>
static bool
break_words(Utf8Iterator& itor, unsigned break_flags, bool with_positions,
	    ACTION action)
{
#ifdef USE_ICU
    if (break_flags & MSet::SNIPPET_WORD_BREAKS) {
	const char* start = itor.raw();
	// get_unbroken() returns the number of codepoints, which we aren't
	// interested in here.
	(void)get_unbroken(itor);
	size_t left = itor.raw() - start;
	for (WordIterator tk(start, left); tk != WordIterator(); ++tk) {
	    const string& token = *tk;
	    left -= token.length();
	    // Pass the number of bytes left after this token so the caller
	    // can work out the token's end offset in the input.
	    if (!action(token, with_positions, itor.left() + left))
		return false;
	}
	return true;
    }
#else
    (void)break_flags;
#endif
    NgramIterator tk(itor);
    while (tk != NgramIterator()) {
	const string& token = *tk;
	// FLAG_NGRAMS only sets positions for tokens of length 1.
	bool with_pos = with_positions && tk.unigram();
	if (!action(token, with_pos, tk.get_utf8iterator().left()))
	    return false;
	++tk;
    }
    // Update itor to point the end of the span of text in an unbroken script.
    itor = tk.get_utf8iterator();
    return true;
}
/** Templated framework for processing terms.
 *
 *  Calls action(term, positional) for each term to add, where term is a
 *  std::string holding the term, and positional is a bool indicating
 *  if this term carries positional information.
 */
template<typename ACTION>
static void
parse_terms(Utf8Iterator itor, unsigned break_flags, bool with_positions,
	    ACTION action)
{
    while (true) {
	// Advance to the start of the next term.
	unsigned ch;
	while (true) {
	    if (itor == Utf8Iterator()) return;
	    ch = check_wordchar(*itor);
	    if (ch) break;
	    ++itor;
	}

	string term;
	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
	// Don't worry if there's a trailing '.' or not.
	if (U_isupper(*itor)) {
	    const Utf8Iterator end;
	    Utf8Iterator p = itor;
	    do {
		Unicode::append_utf8(term, Unicode::tolower(*p++));
	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
	    // One letter does not make an acronym!  If we handled a single
	    // uppercase letter here, we wouldn't catch M&S below.
	    if (term.size() > 1) {
		// Check there's not a (lower case) letter or digit
		// immediately after it.
		if (p == end || !Unicode::is_wordchar(*p)) {
		    itor = p;
		    goto endofterm;
		}
	    }
	    // Not an acronym after all - start the term from scratch.
	    term.resize(0);
	}

	while (true) {
	    if (break_flags && is_unbroken_wordchar(*itor)) {
		// Hand off a span of "unbroken script" text (e.g. CJK,
		// presumably - determined by is_unbroken_wordchar()).
		if (!break_words(itor, break_flags, with_positions, action))
		    return;
		// Skip to the next word character.
		while (true) {
		    if (itor == Utf8Iterator()) return;
		    ch = check_wordchar(*itor);
		    if (ch) break;
		    ++itor;
		}
		continue;
	    }
	    // Accumulate a run of word characters into term.
	    unsigned prevch;
	    do {
		Unicode::append_utf8(term, ch);
		prevch = ch;
		if (++itor == Utf8Iterator() ||
		    (break_flags && is_unbroken_script(*itor)))
		    goto endofterm;
		ch = check_wordchar(*itor);
	    } while (ch);

	    // Only allow an infix character if there's a word character
	    // on both sides of it.
	    Utf8Iterator next(itor);
	    ++next;
	    if (next == Utf8Iterator()) break;
	    unsigned nextch = check_wordchar(*next);
	    if (!nextch) break;
	    unsigned infix_ch = *itor;
	    if (is_digit(prevch) && is_digit(*next)) {
		infix_ch = check_infix_digit(infix_ch);
	    } else {
		// Handle things like '&' in AT&T, apostrophes, etc.
		infix_ch = check_infix(infix_ch);
	    }
	    if (!infix_ch) break;
	    if (infix_ch != UNICODE_IGNORE)
		Unicode::append_utf8(term, infix_ch);
	    ch = nextch;
	    itor = next;
	}

	{
	    // Handle term suffixes like the '+'s in "C++" - but only allow
	    // up to 3 suffix characters.
	    size_t len = term.size();
	    unsigned count = 0;
	    while ((ch = check_suffix(*itor))) {
		if (++count > 3) {
		    term.resize(len);
		    break;
		}
		Unicode::append_utf8(term, ch);
		if (++itor == Utf8Iterator()) goto endofterm;
	    }
	    // Don't index fish+chips as fish+ chips.
	    if (Unicode::is_wordchar(*itor))
		term.resize(len);
	}

endofterm:
	if (!action(term, with_positions, itor.left()))
	    return;
    }
}
/** Tokenise text and add the resulting terms to the document.
 *
 *  @param itor		  Iterator over the UTF-8 text to index.
 *  @param wdf_inc	  The wdf increment to use for each generated term.
 *  @param prefix	  Term prefix to prepend to each generated term.
 *  @param with_positions Whether to add positional information.
 */
void
TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
				    string_view prefix, bool with_positions)
{
#ifndef USE_ICU
    if (flags & FLAG_WORD_BREAKS) {
	throw Xapian::FeatureUnavailableError("FLAG_WORD_BREAKS requires "
					      "building Xapian to use ICU");
    }
#endif
    unsigned break_flags = flags & (FLAG_NGRAMS | FLAG_WORD_BREAKS);
    if (break_flags == 0 && is_ngram_enabled()) {
	break_flags = FLAG_NGRAMS;
    }

    // With no stopper set, act as if stopping is disabled.
    stop_strategy current_stop_mode;
    if (!stopper) {
	current_stop_mode = TermGenerator::STOP_NONE;
    } else {
	current_stop_mode = stop_mode;
    }

    // Create two std::string objects which we effectively use as buffers to
    // build the prefixed terms we generate (one for non-stemmed and one for
    // stemmed terms).  This is a simple way to avoid creating a temporary
    // std::string object each time we generate a prefixed term.
    string prefixed_term;
    auto prefix_size = prefix.size();
    prefixed_term.reserve(prefix_size + max_word_length);
    prefixed_term.assign(prefix);

    // Stemmed terms get a 'Z' prefix, except with STEM_ALL.
    string prefixed_stemmed_term;
    int add_z = (strategy != TermGenerator::STEM_ALL);
    prefixed_stemmed_term.reserve(add_z + prefix_size + max_word_length);
    if (add_z) prefixed_stemmed_term.assign(1, 'Z');
    prefixed_stemmed_term.append(prefix);
    auto prefixed_stemmed_size = prefixed_stemmed_term.size();

    parse_terms(itor, break_flags, with_positions,
		[=, &prefixed_term, &prefixed_stemmed_term
#if __cplusplus >= 201907L
// C++20 no longer supports implicit `this` in lambdas but older C++ versions
// don't allow `this` here.
		 , this
#endif
		](const string & term, bool positional, size_t) {
	if (term.size() > max_word_length) return true;

	if (current_stop_mode == TermGenerator::STOP_ALL &&
	    (*stopper)(term)) {
	    return true;
	}

	// Add the unstemmed form (unless we're stemming everything).
	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_NONE ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    prefixed_term.append(term);
	    if (positional) {
		if (rare(cur_pos >= pos_limit))
		    throw Xapian::RangeError("termpos limit exceeded");
		doc.add_posting(prefixed_term, ++cur_pos, wdf_inc);
	    } else {
		doc.add_term(prefixed_term, wdf_inc);
	    }
	    // Truncate back to just the prefix ready for the next term.
	    prefixed_term.resize(prefix_size);
	}

	// MSVC seems to need "this->" on member variables in this
	// situation.
	if ((this->flags & FLAG_SPELLING) && prefix_size == 0)
	    db.add_spelling(term);

	if (strategy == TermGenerator::STEM_NONE || stemmer.is_none())
	    return true;

	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    if (current_stop_mode == TermGenerator::STOP_STEMMED &&
		(*stopper)(term))
		return true;

	    // Note, this uses the lowercased term, but that's OK as we
	    // only want to avoid stemming terms starting with a digit.
	    if (!should_stem(term)) return true;
	}

	// Add stemmed form without positional information.
	const string& stem = stemmer(term);
	if (rare(stem.empty())) return true;
	prefixed_stemmed_term.append(stem);
	if (strategy != TermGenerator::STEM_SOME && positional) {
	    if (strategy != TermGenerator::STEM_SOME_FULL_POS) {
		if (rare(cur_pos >= pos_limit))
		    throw Xapian::RangeError("termpos limit exceeded");
		++cur_pos;
	    }
	    doc.add_posting(prefixed_stemmed_term, cur_pos, wdf_inc);
	} else {
	    doc.add_term(prefixed_stemmed_term, wdf_inc);
	}
	// Truncate back to just the (Z +) prefix ready for the next term.
	prefixed_stemmed_term.resize(prefixed_stemmed_size);
	return true;
    });
}
/// One word (plus any attached punctuation) in the snippet pipeline.
struct Sniplet {
    // Pointer to the relevance weight for this word (NULL if it carries no
    // relevance of its own - see SnipPipe::pump()).
    double* relevance;

    // Offset of the end of this word in the input text.
    size_t term_end;

    // Highlight span: number of words to highlight starting here (0 means
    // no highlighting for this word).
    size_t highlight;

    Sniplet(double* r, size_t t, size_t h)
	: relevance(r), term_end(t), highlight(h) { }
};
/** Sliding-window pipeline used to pick the best snippet from some text.
 *
 *  Words are pump()ed in one at a time; the pipe tracks the rolling
 *  relevance sum of a window of at most `length` bytes and remembers the
 *  best-scoring window seen, which drain() then renders to the output.
 */
class SnipPipe {
    // Words currently in the candidate window.
    deque<Sniplet> pipe;

    // Words belonging to the best window seen so far.
    deque<Sniplet> best_pipe;

    // Requested length for snippet.
    size_t length;

    // Position in text of start of current pipe contents.
    size_t begin = 0;

    // Rolling sum of the current pipe contents.
    double sum = 0;

    // Remaining words in the phrase currently being highlighted by drain().
    size_t phrase_len = 0;

  public:
    // Offset of the start of the best window found.
    size_t best_begin = 0;

    // Offset of the end of the best window found.
    size_t best_end = 0;

    // Relevance sum of the best window found.
    double best_sum = 0;

    // Add one to length to allow for inter-word space.
    // FIXME: We ought to correctly allow for multiple spaces.
    explicit SnipPipe(size_t length_) : length(length_ + 1) { }

    bool pump(double* r, size_t t, size_t h, unsigned flags);

    void done();

    bool drain(string_view input,
	       string_view hi_start,
	       string_view hi_end,
	       string_view omit,
	       string& output);
};
436 #define DECAY 2.0
/** Feed one word into the snippet window.
 *
 *  @param r	 Pointer to the word's relevance (may be NULL).
 *  @param t	 Offset of the end of the word in the input text.
 *  @param h	 Highlight length in words (h > 1 marks the final word of a
 *		 phrase of h words).
 *  @param flags Snippet flags (SNIPPET_EXHAUSTIVE affects early exit).
 *
 *  Returns false if we can stop scanning the input early.
 */
inline bool
SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
{
    if (h > 1) {
	if (pipe.size() >= h - 1) {
	    // The final term of a phrase is entering the window.  Peg the
	    // phrase's relevance onto the first term of the phrase, so it'll
	    // be removed from `sum` when the phrase starts to leave the
	    // window.
	    auto & phrase_start = pipe[pipe.size() - (h - 1)];
	    if (phrase_start.relevance) {
		*phrase_start.relevance *= DECAY;
		sum -= *phrase_start.relevance;
	    }
	    sum += *r;
	    phrase_start.relevance = r;
	    phrase_start.highlight = h;
	    *r /= DECAY;
	}
	// The phrase's weight is carried by its first word, so this final
	// word enters the pipe with no relevance or highlight of its own.
	r = NULL;
	h = 0;
    }
    pipe.emplace_back(r, t, h);
    if (r) {
	sum += *r;
	*r /= DECAY;
    }

    // If necessary, discard words from the start of the pipe until it has the
    // desired length.
    // FIXME: Also shrink the window past words with relevance < 0?
    while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
	const Sniplet& word = pipe.front();
	if (word.relevance) {
	    *word.relevance *= DECAY;
	    sum -= *word.relevance;
	}
	begin = word.term_end;
	if (best_end >= begin)
	    best_pipe.push_back(word);
	pipe.pop_front();
	// E.g. can happen if the current term is longer than the requested
	// length!
	if (rare(pipe.empty())) break;
    }

    // Using > here doesn't work well, as we don't extend a snippet over terms
    // with 0 weight.
    if (sum >= best_sum) {
	// Discard any part of `best_pipe` which is before `begin`.
	if (begin >= best_end) {
	    best_pipe.clear();
	} else {
	    while (!best_pipe.empty() &&
		   best_pipe.front().term_end <= begin) {
		best_pipe.pop_front();
	    }
	}
	best_sum = sum;
	best_begin = begin;
	best_end = t;
    } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
	if (best_sum > 0 && best_end < begin) {
	    // We found something, and we aren't still looking near it.
	    // FIXME: Benchmark this and adjust if necessary.
	    return false;
	}
    }
    return true;
}
/** Finish feeding words in, trimming the pipe down to the best window. */
inline void
SnipPipe::done()
{
    // Discard any part of `pipe` which is after `best_end`.
    if (begin >= best_end) {
	pipe.clear();
    } else {
	// We should never empty the pipe (as that case should be handled
	// above).
	while (rare(!pipe.empty()) &&
	       pipe.back().term_end > best_end) {
	    pipe.pop_back();
	}
    }
}
525 // Check if a non-word character is should be included at the start of the
526 // snippet. We want to include certain leading non-word characters, but not
527 // others.
528 static inline bool
529 snippet_check_leading_nonwordchar(unsigned ch) {
530 if (Unicode::is_currency(ch) ||
531 Unicode::get_category(ch) == Unicode::OPEN_PUNCTUATION ||
532 Unicode::get_category(ch) == Unicode::INITIAL_QUOTE_PUNCTUATION) {
533 return true;
535 switch (ch) {
536 case '"':
537 case '#':
538 case '%':
539 case '&':
540 case '\'':
541 case '+':
542 case '-':
543 case '/':
544 case '<':
545 case '@':
546 case '\\':
547 case '`':
548 case '~':
549 case 0x00A1: // INVERTED EXCLAMATION MARK
550 case 0x00A7: // SECTION SIGN
551 case 0x00BF: // INVERTED QUESTION MARK
552 return true;
554 return false;
557 // Check if a non-word character is should be included at the end of the
558 // snippet. We want to include certain trailing non-word characters, but not
559 // others.
560 static inline bool
561 snippet_check_trailing_nonwordchar(unsigned ch) {
562 if (Unicode::is_currency(ch) ||
563 Unicode::get_category(ch) == Unicode::CLOSE_PUNCTUATION ||
564 Unicode::get_category(ch) == Unicode::FINAL_QUOTE_PUNCTUATION) {
565 return true;
567 switch (ch) {
568 case '"':
569 case '%':
570 case '\'':
571 case '+':
572 case '-':
573 case '/':
574 case '>':
575 case '@':
576 case '\\':
577 case '`':
578 case '~':
579 return true;
581 return false;
/** Append the bytes [p, end) to output, escaping XML-special characters.
 *
 *  '&', '<' and '>' are replaced by their XML entities; everything else is
 *  copied through verbatim.
 */
static inline void
append_escaping_xml(const char* p, const char* end, std::string& output)
{
    while (p != end) {
	char ch = *p++;
	if (ch == '&') {
	    output += "&amp;";
	} else if (ch == '<') {
	    output += "&lt;";
	} else if (ch == '>') {
	    output += "&gt;";
	} else {
	    output += ch;
	}
    }
}
/** Emit the next word of the chosen snippet into @a output.
 *
 *  Call repeatedly until it returns false.  Handles adding the `omit`
 *  marker at a truncated start/end, highlight start/end markers, and
 *  XML-escaping of the copied text.
 */
inline bool
SnipPipe::drain(string_view input,
		string_view hi_start,
		string_view hi_end,
		string_view omit,
		string& output)
{
    if (best_pipe.empty() && !pipe.empty()) {
	swap(best_pipe, pipe);
    }

    if (best_pipe.empty()) {
	// All words emitted - decide how to finish off the snippet.
	size_t tail_len = input.size() - best_end;
	if (tail_len == 0) return false;

	// See if this is the end of a sentence.
	// FIXME: This is quite simplistic - look at the Unicode rules:
	// https://unicode.org/reports/tr29/#Sentence_Boundaries
	bool sentence_end = false;
	Utf8Iterator i(input.data() + best_end, tail_len);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (sentence_end && Unicode::is_whitespace(ch)) break;

	    // Allow "...", "!!", "!?!", etc...
	    sentence_end = (ch == '.' || ch == '?' || ch == '!');

	    if (Unicode::is_wordchar(ch)) break;
	    ++i;
	}

	if (sentence_end) {
	    // Include end of sentence punctuation.
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    return false;
	}

	// Include trailing punctuation which includes meaning or context.
	i.assign(input.data() + best_end, tail_len);
	int trailing_punc = 0;
	while (i != Utf8Iterator() && snippet_check_trailing_nonwordchar(*i)) {
	    // But limit how much trailing punctuation we include.
	    if (++trailing_punc > 4) {
		trailing_punc = 0;
		break;
	    }
	    ++i;
	}
	if (trailing_punc) {
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    if (i == Utf8Iterator()) return false;
	}

	// Append "..." or equivalent as this doesn't seem to be the start
	// of a sentence.
	output += omit;

	return false;
    }

    const Sniplet & word = best_pipe.front();

    if (output.empty()) {
	// Start of the snippet.
	enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;

	Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    switch (sentence_boundary) {
		case NO:
		    if (ch == '.' || ch == '?' || ch == '!') {
			sentence_boundary = PUNC;
		    }
		    break;
		case PUNC:
		    if (Unicode::is_whitespace(ch)) {
			sentence_boundary = YES;
		    } else if (ch == '.' || ch == '?' || ch == '!') {
			// Allow "...", "!!", "!?!", etc...
		    } else {
			sentence_boundary = NO;
		    }
		    break;
		case YES:
		    break;
	    }

	    // Start the snippet at the start of the first word, but include
	    // certain punctuation too.
	    if (Unicode::is_wordchar(ch)) {
		// But limit how much leading punctuation we include.
		size_t word_begin = i.raw() - input.data();
		if (word_begin - best_begin > 4) {
		    best_begin = word_begin;
		}
		break;
	    }
	    ++i;
	    if (!snippet_check_leading_nonwordchar(ch)) {
		best_begin = i.raw() - input.data();
	    }
	}

	// Add "..." or equivalent if this doesn't seem to be the start of a
	// sentence.
	if (sentence_boundary != YES) {
	    output += omit;
	}
    }

    if (word.highlight) {
	// Don't include inter-word characters in the highlight.
	Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (Unicode::is_wordchar(ch)) {
		append_escaping_xml(input.data() + best_begin, i.raw(), output);
		best_begin = i.raw() - input.data();
		break;
	    }
	    ++i;
	}
    }

    // Start a new highlight if we're not already inside one.
    if (!phrase_len) {
	phrase_len = word.highlight;
	if (phrase_len) output += hi_start;
    }

    const char* p = input.data();
    append_escaping_xml(p + best_begin, p + word.term_end, output);
    best_begin = word.term_end;

    // Close the highlight after the last word of the phrase.
    if (phrase_len && --phrase_len == 0) output += hi_end;

    best_pipe.pop_front();
    return true;
}
/** Recursively examine @a query, collecting what the snippet code needs.
 *
 *  Leaf terms go into @a loose_terms; wildcard and edit-distance subqueries
 *  into @a wildcards and @a fuzzies; tight phrases of terms into
 *  @a exact_phrases (with @a longest_phrase tracking the longest one).
 */
static void
check_query(const Xapian::Query & query,
	    list<vector<string>> & exact_phrases,
	    unordered_map<string, double> & loose_terms,
	    list<const Xapian::Internal::QueryWildcard*> & wildcards,
	    list<const Xapian::Internal::QueryEditDistance*> & fuzzies,
	    size_t & longest_phrase)
{
    // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
    size_t n_subqs = query.get_num_subqueries();
    Xapian::Query::op op = query.get_type();
    if (op == query.LEAF_TERM) {
	const Xapian::Internal::QueryTerm & qt =
	    *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
	loose_terms.insert(make_pair(qt.get_term(), 0));
    } else if (op == query.OP_WILDCARD) {
	using Xapian::Internal::QueryWildcard;
	const QueryWildcard* qw =
	    static_cast<const QueryWildcard*>(query.internal.get());
	wildcards.push_back(qw);
    } else if (op == query.OP_EDIT_DISTANCE) {
	using Xapian::Internal::QueryEditDistance;
	const QueryEditDistance* qed =
	    static_cast<const QueryEditDistance*>(query.internal.get());
	fuzzies.push_back(qed);
    } else if (op == query.OP_PHRASE) {
	const Xapian::Internal::QueryPhrase & phrase =
	    *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
	if (phrase.get_window() == n_subqs) {
	    // Tight phrase.
	    for (size_t i = 0; i != n_subqs; ++i) {
		if (query.get_subquery(i).get_type() != query.LEAF_TERM)
		    goto non_term_subquery;
	    }

	    // Tight phrase of terms.
	    exact_phrases.push_back(vector<string>());
	    vector<string> & terms = exact_phrases.back();
	    terms.reserve(n_subqs);
	    for (size_t i = 0; i != n_subqs; ++i) {
		Xapian::Query q = query.get_subquery(i);
		const Xapian::Internal::QueryTerm & qt =
		    *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
		terms.push_back(qt.get_term());
	    }
	    if (n_subqs > longest_phrase) longest_phrase = n_subqs;
	    return;
	}
    }

non_term_subquery:
    // Not a leaf we handle specially - recurse into the subqueries.
    for (size_t i = 0; i != n_subqs; ++i)
	check_query(query.get_subquery(i), exact_phrases, loose_terms,
		    wildcards, fuzzies, longest_phrase);
}
800 static double*
801 check_term(unordered_map<string, double> & loose_terms,
802 const Xapian::Weight::Internal * stats,
803 const string & term,
804 double max_tw)
806 auto it = loose_terms.find(term);
807 if (it == loose_terms.end()) return NULL;
809 if (it->second == 0.0) {
810 double relevance;
811 if (!stats->get_termweight(term, relevance)) {
812 // FIXME: Assert?
813 loose_terms.erase(it);
814 return NULL;
817 it->second = relevance + max_tw;
819 return &it->second;
/** Generate a snippet from @a text, highlighting terms which match this
 *  MSet's query.
 *
 *  @param length	Target snippet length in bytes.
 *  @param stemmer	Stemmer used to match stemmed query terms.
 *  @param flags	SNIPPET_* flags.
 *  @param hi_start	String to insert before a highlighted match.
 *  @param hi_end	String to insert after a highlighted match.
 *  @param omit		String to insert where text has been elided.
 */
string
MSet::Internal::snippet(string_view text,
			size_t length,
			const Xapian::Stem & stemmer,
			unsigned flags,
			string_view hi_start,
			string_view hi_end,
			string_view omit) const
{
    if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
	// Too easy!
	return string{text};
    }

#ifndef USE_ICU
    if (flags & MSet::SNIPPET_WORD_BREAKS) {
	throw Xapian::FeatureUnavailableError("SNIPPET_WORD_BREAKS requires "
					      "building Xapian to use ICU");
    }
#endif
    auto SNIPPET_BREAK_MASK = MSet::SNIPPET_NGRAMS | MSet::SNIPPET_WORD_BREAKS;
    unsigned break_flags = flags & SNIPPET_BREAK_MASK;
    if (break_flags == 0 && is_ngram_enabled()) {
	break_flags = MSet::SNIPPET_NGRAMS;
    }

    size_t term_start = 0;
    double min_tw = 0, max_tw = 0;
    if (stats) stats->get_max_termweight(min_tw, max_tw);
    if (max_tw == 0.0) {
	max_tw = 1.0;
    } else {
	// Scale up by (1 + 1/64) so that highlighting works better for terms
	// with termweight 0 (which happens for terms not in the database, and
	// also with some weighting schemes for terms which occur in almost all
	// documents.
	max_tw *= 1.015625;
    }

    Xapian::Query query;
    if (enquire) {
	query = enquire->query;
    }
    SnipPipe snip(length);

    // Collect the query parts we can match against the text.
    list<vector<string>> exact_phrases;
    unordered_map<string, double> loose_terms;
    list<const Xapian::Internal::QueryWildcard*> wildcards;
    list<const Xapian::Internal::QueryEditDistance*> fuzzies;
    size_t longest_phrase = 0;
    check_query(query, exact_phrases, loose_terms,
		wildcards, fuzzies, longest_phrase);

    vector<double> exact_phrases_relevance;
    exact_phrases_relevance.reserve(exact_phrases.size());
    for (auto&& terms : exact_phrases) {
	// FIXME: What relevance to use?
	exact_phrases_relevance.push_back(max_tw * terms.size());
    }

    vector<double> wildcards_relevance;
    wildcards_relevance.reserve(wildcards.size());
    for (auto&& pattern : wildcards) {
	// FIXME: What relevance to use?
	(void)pattern;
	wildcards_relevance.push_back(max_tw + min_tw);
    }

    vector<double> fuzzies_relevance;
    fuzzies_relevance.reserve(fuzzies.size());
    for (auto&& pattern : fuzzies) {
	// FIXME: What relevance to use?
	(void)pattern;
	fuzzies_relevance.push_back(max_tw + min_tw);
    }

    // Background relevance is the same for a given MSet, so cache it
    // between calls to MSet::snippet() on the same object.
    unordered_map<string, double>& background = snippet_bg_relevance;

    // Ring buffer of the last (longest_phrase - 1) terms, used to detect
    // exact phrase matches ending at the current term.
    vector<string> phrase;
    if (longest_phrase) phrase.resize(longest_phrase - 1);
    size_t phrase_next = 0;
    bool matchfound = false;
    parse_terms(Utf8Iterator(text), break_flags, true,
		[&](const string & term, bool positional, size_t left) {
	// FIXME: Don't hardcode this here.
	const size_t max_word_length = 64;

	if (!positional) return true;
	if (term.size() > max_word_length) return true;

	// We get segments with any "inter-word" characters in front of
	// each word, e.g.:
	// [The][ cat][ sat][ on][ the][ mat]
	size_t term_end = text.size() - left;

	double* relevance = 0;
	size_t highlight = 0;
	if (stats) {
	    // Check for an exact phrase ending with this term.
	    size_t i = 0;
	    for (auto&& terms : exact_phrases) {
		if (term == terms.back()) {
		    size_t n = terms.size() - 1;
		    bool match = true;
		    while (UNSIGNED_OVERFLOW_OK(n--)) {
			if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
			    match = false;
			    break;
			}
		    }
		    if (match) {
			// FIXME: Sort phrases, highest score first!
			relevance = &exact_phrases_relevance[i];
			highlight = terms.size();
			goto relevance_done;
		    }
		}
		++i;
	    }

	    relevance = check_term(loose_terms, stats.get(), term, max_tw);
	    if (relevance) {
		// Matched unstemmed term.
		highlight = 1;
		goto relevance_done;
	    }

	    string stem = "Z";
	    stem += stemmer(term);
	    relevance = check_term(loose_terms, stats.get(), stem, max_tw);
	    if (relevance) {
		// Matched stemmed term.
		highlight = 1;
		goto relevance_done;
	    }

	    // Check wildcards.
	    // FIXME: Sort wildcards, cheapest to check first or something?
	    i = 0;
	    for (auto&& qw : wildcards) {
		if (qw->test(term)) {
		    relevance = &wildcards_relevance[i];
		    highlight = 1;
		    goto relevance_done;
		}
		++i;
	    }

	    // Check fuzzies.
	    // FIXME: Sort fuzzies, cheapest to check first or something?
	    i = 0;
	    for (auto&& qed : fuzzies) {
		// test() returns 0 for no match, or edit_distance + 1.
		int ed_result = qed->test(term);
		if (ed_result) {
		    // FIXME: Reduce relevance the more edits there are?
		    // We can't just divide by ed_result here as this
		    // relevance is used by any term matching this
		    // subquery.
		    relevance = &fuzzies_relevance[i];
		    highlight = 1;
		    goto relevance_done;
		}
		++i;
	    }

	    if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
		// Background document model.
		auto bgit = background.find(term);
		if (bgit == background.end()) bgit = background.find(stem);
		if (bgit == background.end()) {
		    Xapian::doccount tf = enquire->db.get_termfreq(term);
		    if (!tf) {
			tf = enquire->db.get_termfreq(stem);
		    } else {
			stem = term;
		    }
		    double r = 0.0;
		    if (tf) {
			// Add one to avoid log(0) when a term indexes all
			// documents.
			Xapian::doccount num_docs = stats->collection_size + 1;
			r = max_tw * log((num_docs - tf) / double(tf));
			r /= (length + 1) * log(double(num_docs));
#if 0
			if (r <= 0) {
			    Utf8Iterator i(text.data() + term_start, text.data() + term_end);
			    while (i != Utf8Iterator()) {
				if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
				    r = max_tw * 0.05;
				}
			    }
			}
#endif
		    }
		    bgit = background.emplace(make_pair(stem, r)).first;
		}
		relevance = &bgit->second;
	    }
	} else {
#if 0
	    // In the absence of weight information, assume longer terms
	    // are more relevant, and that unstemmed matches are a bit more
	    // relevant than stemmed matches.
	    if (queryterms.find(term) != queryterms.end()) {
		relevance = term.size() * 3;
	    } else {
		string stem = "Z";
		stem += stemmer(term);
		if (queryterms.find(stem) != queryterms.end()) {
		    relevance = term.size() * 2;
		}
	    }
#endif
	}

	// FIXME: Allow Enquire without a DB set or an empty MSet() to be
	// used if you don't want the collection model?

#if 0
	// FIXME: Punctuation should somehow be included in the model, but this
	// approach is problematic - we don't want the first word of a sentence
	// to be favoured when it's at the end of the window.

	// Give first word in each sentence a relevance boost.
	if (term_start == 0) {
	    relevance += 10;
	} else {
	    for (size_t i = term_start; i + term.size() < term_end; ++i) {
		if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
		    relevance += 10;
		    break;
		}
	    }
	}
#endif

relevance_done:
	if (longest_phrase) {
	    // Record this term in the phrase ring buffer.
	    phrase[phrase_next] = term;
	    phrase_next = (phrase_next + 1) % (longest_phrase - 1);
	}

	if (highlight) matchfound = true;

	if (!snip.pump(relevance, term_end, highlight, flags)) return false;

	term_start = term_end;
	return true;
    });

    snip.done();

    // Put together the snippet.
    string result;
    if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
	while (snip.drain(text, hi_start, hi_end, omit, result)) { }
    }

    return result;
}