Add regression test for previous commit
[xapian.git] / xapian-core / queryparser / termgenerator_internal.cc
blob470131c7a25d6830e276f2f0e0d398f00c04cd8e
1 /** @file
2 * @brief TermGenerator class internals
3 */
4 /* Copyright (C) 2007,2010,2011,2012,2015,2016,2017,2018,2019,2020 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "termgenerator_internal.h"
25 #include "api/omenquireinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
35 #include <algorithm>
36 #include <cmath>
37 #include <deque>
38 #include <limits>
39 #include <list>
40 #include <string>
41 #include <unordered_map>
42 #include <vector>
44 #include "cjk-tokenizer.h"
46 using namespace std;
48 namespace Xapian {
50 static inline bool
51 U_isupper(unsigned ch)
53 return ch < 128 && C_isupper(static_cast<unsigned char>(ch));
56 static inline unsigned
57 check_wordchar(unsigned ch)
59 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
60 return 0;
63 static inline bool
64 should_stem(const std::string & term)
66 const unsigned int SHOULD_STEM_MASK =
67 (1 << Unicode::LOWERCASE_LETTER) |
68 (1 << Unicode::TITLECASE_LETTER) |
69 (1 << Unicode::MODIFIER_LETTER) |
70 (1 << Unicode::OTHER_LETTER);
71 Utf8Iterator u(term);
72 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
/** Value representing "ignore this" when returned by check_infix() or
 *  check_infix_digit().
 */
static const unsigned UNICODE_IGNORE = std::numeric_limits<unsigned>::max();

/** Classify a character appearing between two word characters.
 *
 *  @return The character to append to the term (Unicode apostrophe variants
 *	    are normalised to ASCII '\''); UNICODE_IGNORE for zero-width
 *	    characters which should be skipped entirely; or 0 if the
 *	    character ends the term.
 */
static inline unsigned
check_infix(unsigned ch)
{
    switch (ch) {
	case '\'':
	case '&':
	case 0xb7:
	case 0x5f4:
	case 0x2027:
	    // Unicode includes all these except '&' in its word boundary
	    // rules, as well as 0x2019 (which we handle below) and ':' (for
	    // Swedish apparently, but we ignore this for now as it's
	    // problematic in real world cases).
	    return ch;
	case 0x2019: // Unicode apostrophe and single closing quote.
	case 0x201b: // Unicode single opening quote with the tail rising.
	    return '\'';
	case 0x200b: // ZERO WIDTH SPACE
	case 0x200c: // ZERO WIDTH NON-JOINER
	case 0x200d: // ZERO WIDTH JOINER
	case 0x2060: // WORD JOINER
	case 0xfeff: // ZERO WIDTH NO-BREAK SPACE
	    return UNICODE_IGNORE;
	default:
	    return 0;
    }
}
98 static inline unsigned
99 check_infix_digit(unsigned ch)
101 // This list of characters comes from Unicode's word identifying algorithm.
102 switch (ch) {
103 case ',':
104 case '.':
105 case ';':
106 case 0x037e: // GREEK QUESTION MARK
107 case 0x0589: // ARMENIAN FULL STOP
108 case 0x060D: // ARABIC DATE SEPARATOR
109 case 0x07F8: // NKO COMMA
110 case 0x2044: // FRACTION SLASH
111 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
112 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
113 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
114 return ch;
116 if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
117 return UNICODE_IGNORE;
118 return 0;
121 static inline bool
122 is_digit(unsigned ch) {
123 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
/** Check for characters which may sensibly end a term.
 *
 *  Currently '+' (e.g. "C++") and '#' (e.g. "C#") are allowed.
 *
 *  @return ch if it's an allowed suffix character, otherwise 0.
 */
static inline unsigned
check_suffix(unsigned ch)
{
    // FIXME: what about '-'?
    switch (ch) {
	case '+':
	case '#':
	    return ch;
	default:
	    return 0;
    }
}
/** Templated framework for processing terms.
 *
 *  Calls action(term, positional, itor) for each term to add, where term is
 *  a std::string holding the term, positional is a bool indicating if this
 *  term carries positional information, and itor is a Utf8Iterator pointing
 *  just after the term in the input.  The action returns true to continue
 *  parsing, or false to stop early.
 *
 *  @param itor		  Start of the UTF-8 input text.
 *  @param cjk_ngram	  Segment CJK text into n-grams?
 *  @param with_positions Request positional information for normal terms?
 */
template<typename ACTION>
static void
parse_terms(Utf8Iterator itor, bool cjk_ngram, bool with_positions, ACTION action)
{
    while (true) {
	// Advance to the start of the next term.
	unsigned ch;
	while (true) {
	    if (itor == Utf8Iterator()) return;
	    ch = check_wordchar(*itor);
	    if (ch) break;
	    ++itor;
	}

	string term;
	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
	// Don't worry if there's a trailing '.' or not.
	if (U_isupper(*itor)) {
	    const Utf8Iterator end;
	    Utf8Iterator p = itor;
	    do {
		Unicode::append_utf8(term, Unicode::tolower(*p++));
	    } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
	    // One letter does not make an acronym!  If we handled a single
	    // uppercase letter here, we wouldn't catch M&S below.
	    if (term.size() > 1) {
		// Check there's not a (lower case) letter or digit
		// immediately after it.
		if (p == end || !Unicode::is_wordchar(*p)) {
		    itor = p;
		    goto endofterm;
		}
	    }
	    // Not an acronym after all - discard what we built and fall
	    // through to normal term parsing from the same position.
	    term.resize(0);
	}

	while (true) {
	    if (cjk_ngram &&
		CJK::codepoint_is_cjk(*itor) &&
		Unicode::is_wordchar(*itor)) {
		// Emit CJK text as n-grams; only unigrams carry positions.
		CJKTokenIterator tk(itor);
		while (tk != CJKTokenIterator()) {
		    const string& cjk_token = *tk;
		    if (!action(cjk_token, with_positions && tk.unigram(),
				tk.get_utf8iterator()))
			return;
		    ++tk;
		}
		// Update itor to end of CJK text span.
		itor = tk.get_utf8iterator();
		while (true) {
		    if (itor == Utf8Iterator()) return;
		    ch = check_wordchar(*itor);
		    if (ch) break;
		    ++itor;
		}
		continue;
	    }
	    // Accumulate a run of word characters into `term`.
	    unsigned prevch;
	    do {
		Unicode::append_utf8(term, ch);
		prevch = ch;
		if (++itor == Utf8Iterator() ||
		    (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
		    goto endofterm;
		ch = check_wordchar(*itor);
	    } while (ch);

	    // The term only continues past a non-word character if it's an
	    // allowed "infix" character with a word character after it.
	    Utf8Iterator next(itor);
	    ++next;
	    if (next == Utf8Iterator()) break;
	    unsigned nextch = check_wordchar(*next);
	    if (!nextch) break;
	    unsigned infix_ch = *itor;
	    if (is_digit(prevch) && is_digit(*next)) {
		infix_ch = check_infix_digit(infix_ch);
	    } else {
		// Handle things like '&' in AT&T, apostrophes, etc.
		infix_ch = check_infix(infix_ch);
	    }
	    if (!infix_ch) break;
	    if (infix_ch != UNICODE_IGNORE)
		Unicode::append_utf8(term, infix_ch);
	    ch = nextch;
	    itor = next;
	}

	// Allow a short run of trailing suffix characters (e.g. "C++").
	size_t len = term.size();
	unsigned count = 0;
	while ((ch = check_suffix(*itor))) {
	    if (++count > 3) {
		// More than 3 suffix characters - treat them as not part of
		// the term after all.
		term.resize(len);
		break;
	    }
	    Unicode::append_utf8(term, ch);
	    if (++itor == Utf8Iterator()) goto endofterm;
	}
	// Don't index fish+chips as fish+ chips.
	if (Unicode::is_wordchar(*itor))
	    term.resize(len);

endofterm:
	if (!action(term, with_positions, itor))
	    return;
    }
}
/** Tokenise text and add the resulting terms to the document being built.
 *
 *  @param itor		Start of the UTF-8 text to index.
 *  @param wdf_inc	Amount to increase each term's wdf by.
 *  @param prefix	Term prefix to apply (empty string for none).
 *  @param with_positions	Record positional information for terms?
 */
void
TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc,
				    const string & prefix, bool with_positions)
{
    bool cjk_ngram = (flags & FLAG_CJK_NGRAM) || CJK::is_cjk_enabled();

    // With no stopper set, stopping is a no-op whatever stop_mode says.
    stop_strategy current_stop_mode;
    if (!stopper.get()) {
	current_stop_mode = TermGenerator::STOP_NONE;
    } else {
	current_stop_mode = stop_mode;
    }

    parse_terms(itor, cjk_ngram, with_positions,
		[=
#if __cplusplus >= 201907L
// C++20 no longer supports implicit `this` in lambdas but older C++ versions
// don't allow `this` here.
		 , this
#endif
		](const string & term, bool positional, const Utf8Iterator &) {
	if (term.size() > max_word_length) return true;

	if (current_stop_mode == TermGenerator::STOP_ALL &&
	    (*stopper)(term)) {
	    return true;
	}

	// Add the unstemmed form (except under STEM_ALL/STEM_ALL_Z, where
	// only the stemmed form is wanted).
	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_NONE ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    if (positional) {
		doc.add_posting(prefix + term, ++cur_pos, wdf_inc);
	    } else {
		doc.add_term(prefix + term, wdf_inc);
	    }
	}

	// MSVC seems to need "this->" on member variables in this
	// situation.
	if ((this->flags & FLAG_SPELLING) && prefix.empty())
	    db.add_spelling(term);

	if (strategy == TermGenerator::STEM_NONE || stemmer.is_none())
	    return true;

	if (strategy == TermGenerator::STEM_SOME ||
	    strategy == TermGenerator::STEM_SOME_FULL_POS) {
	    if (current_stop_mode == TermGenerator::STOP_STEMMED &&
		(*stopper)(term))
		return true;

	    // Note, this uses the lowercased term, but that's OK as we
	    // only want to avoid stemming terms starting with a digit.
	    if (!should_stem(term)) return true;
	}

	// Add stemmed form without positional information.
	const string& stem = stemmer(term);
	if (rare(stem.empty())) return true;
	string stemmed_term;
	if (strategy != TermGenerator::STEM_ALL) {
	    // Stemmed forms are marked with a "Z" prefix (except under
	    // STEM_ALL, where no unstemmed form is added to distinguish
	    // them from).
	    stemmed_term += "Z";
	}
	stemmed_term += prefix;
	stemmed_term += stem;
	if (strategy != TermGenerator::STEM_SOME && with_positions) {
	    if (strategy != TermGenerator::STEM_SOME_FULL_POS) ++cur_pos;
	    doc.add_posting(stemmed_term, cur_pos, wdf_inc);
	} else {
	    doc.add_term(stemmed_term, wdf_inc);
	}
	return true;
    });
}
/// One term (plus any preceding inter-word gap) flowing through SnipPipe.
struct Sniplet {
    /// Relevance weight for this term, or NULL if it carries none.
    double* relevance;

    /// Byte offset of the end of this term in the text being processed.
    size_t term_end;

    /// Number of terms to highlight (0 = none; > 1 = a phrase).
    size_t highlight;

    Sniplet(double* relevance_, size_t term_end_, size_t highlight_)
	: relevance(relevance_), term_end(term_end_), highlight(highlight_) { }
};
/** Pipeline which slides a window over the terms of the text to find the
 *  most relevant snippet.
 */
class SnipPipe {
    /// Terms currently inside the sliding window.
    deque<Sniplet> pipe;

    /// Terms forming the best window found so far.
    deque<Sniplet> best_pipe;

    // Requested length for snippet.
    size_t length;

    // Position in text of start of current pipe contents.
    size_t begin = 0;

    // Rolling sum of the current pipe contents.
    double sum = 0;

    /// Terms remaining in a phrase highlight currently being emitted.
    size_t phrase_len = 0;

  public:
    /// Byte offset of the start of the best window found so far.
    size_t best_begin = 0;

    /// Byte offset of the end of the best window found so far.
    size_t best_end = 0;

    /// Relevance sum of the best window found so far.
    double best_sum = 0;

    // Add one to length to allow for inter-word space.
    // FIXME: We ought to correctly allow for multiple spaces.
    explicit SnipPipe(size_t length_) : length(length_ + 1) { }

    /// Feed one term into the pipeline; returns false to stop early.
    bool pump(double* r, size_t t, size_t h, unsigned flags);

    /// Finish feeding terms in and trim the window.
    void done();

    /// Emit the next chunk of the snippet; returns false when finished.
    bool drain(const string & input,
	       const string & hi_start,
	       const string & hi_end,
	       const string & omit,
	       string & output);
};
// Decay factor applied to a term's relevance each time it's counted, so
// repeated occurrences contribute progressively less.
#define DECAY 2.0

/** Feed one term into the sliding window.
 *
 *  @param r	 Relevance weight for the term (NULL for none).
 *  @param t	 Byte offset of the end of the term in the text.
 *  @param h	 Number of terms to highlight (> 1 means this term completes
 *		 a phrase).
 *  @param flags Snippet flags (SNIPPET_EXHAUSTIVE affects early termination).
 *
 *  @return false if a good enough snippet has already been found and the
 *	    caller can stop feeding terms in.
 */
inline bool
SnipPipe::pump(double* r, size_t t, size_t h, unsigned flags)
{
    if (h > 1) {
	if (pipe.size() >= h - 1) {
	    // The final term of a phrase is entering the window.  Peg the
	    // phrase's relevance onto the first term of the phrase, so it'll
	    // be removed from `sum` when the phrase starts to leave the
	    // window.
	    auto & phrase_start = pipe[pipe.size() - (h - 1)];
	    if (phrase_start.relevance) {
		*phrase_start.relevance *= DECAY;
		sum -= *phrase_start.relevance;
	    }
	    sum += *r;
	    phrase_start.relevance = r;
	    phrase_start.highlight = h;
	    *r /= DECAY;
	}
	// The phrase's weight is now carried by its first term, so this
	// final term itself enters the window with no weight.
	r = NULL;
	h = 0;
    }
    pipe.emplace_back(r, t, h);
    if (r) {
	sum += *r;
	*r /= DECAY;
    }

    // If necessary, discard words from the start of the pipe until it has the
    // desired length.
    // FIXME: Also shrink the window past words with relevance < 0?
    while (t - begin > length /* || pipe.front().relevance < 0.0 */) {
	const Sniplet& word = pipe.front();
	if (word.relevance) {
	    *word.relevance *= DECAY;
	    sum -= *word.relevance;
	}
	begin = word.term_end;
	// Words leaving the window which are still part of the current best
	// window get kept in best_pipe for later output by drain().
	if (best_end >= begin)
	    best_pipe.push_back(word);
	pipe.pop_front();
	// E.g. can happen if the current term is longer than the requested
	// length!
	if (rare(pipe.empty())) break;
    }

    // Using > here doesn't work well, as we don't extend a snippet over terms
    // with 0 weight.
    if (sum >= best_sum) {
	// Discard any part of `best_pipe` which is before `begin`.
	if (begin >= best_end) {
	    best_pipe.clear();
	} else {
	    while (!best_pipe.empty() &&
		   best_pipe.front().term_end <= begin) {
		best_pipe.pop_front();
	    }
	}
	best_sum = sum;
	best_begin = begin;
	best_end = t;
    } else if ((flags & Xapian::MSet::SNIPPET_EXHAUSTIVE) == 0) {
	if (best_sum > 0 && best_end < begin) {
	    // We found something, and we aren't still looking near it.
	    // FIXME: Benchmark this and adjust if necessary.
	    return false;
	}
    }
    return true;
}
/// Finish feeding terms in: trim the window back to the best range found.
inline void
SnipPipe::done()
{
    // Discard any part of `pipe` which is after `best_end`.
    if (begin >= best_end) {
	pipe.clear();
    } else {
	// We should never empty the pipe (as that case should be handled
	// above).
	// NOTE(review): `rare()` is a branch-prediction hint only, so this
	// is behaviourally correct either way, but the hint looks inverted
	// (the comment above says the pipe should rarely be empty) - confirm
	// against upstream intent.
	while (rare(!pipe.empty()) &&
	       pipe.back().term_end > best_end) {
	    pipe.pop_back();
	}
    }
}
462 // Check if a non-word character is should be included at the start of the
463 // snippet. We want to include certain leading non-word characters, but not
464 // others.
465 static inline bool
466 snippet_check_leading_nonwordchar(unsigned ch) {
467 if (Unicode::is_currency(ch) ||
468 Unicode::get_category(ch) == Unicode::OPEN_PUNCTUATION ||
469 Unicode::get_category(ch) == Unicode::INITIAL_QUOTE_PUNCTUATION) {
470 return true;
472 switch (ch) {
473 case '"':
474 case '#':
475 case '%':
476 case '&':
477 case '\'':
478 case '+':
479 case '-':
480 case '/':
481 case '<':
482 case '@':
483 case '\\':
484 case '`':
485 case '~':
486 case 0x00A1: // INVERTED EXCLAMATION MARK
487 case 0x00A7: // SECTION SIGN
488 case 0x00BF: // INVERTED QUESTION MARK
489 return true;
491 return false;
494 // Check if a non-word character is should be included at the end of the
495 // snippet. We want to include certain trailing non-word characters, but not
496 // others.
497 static inline bool
498 snippet_check_trailing_nonwordchar(unsigned ch) {
499 if (Unicode::is_currency(ch) ||
500 Unicode::get_category(ch) == Unicode::CLOSE_PUNCTUATION ||
501 Unicode::get_category(ch) == Unicode::FINAL_QUOTE_PUNCTUATION) {
502 return true;
504 switch (ch) {
505 case '"':
506 case '%':
507 case '\'':
508 case '+':
509 case '-':
510 case '/':
511 case '>':
512 case '@':
513 case '\\':
514 case '`':
515 case '~':
516 return true;
518 return false;
/** Append the bytes [p, end) to output, escaping '&', '<' and '>' so the
 *  result is safe to embed in XML/HTML.
 */
static inline void
append_escaping_xml(const char* p, const char* end, std::string& output)
{
    for (; p != end; ++p) {
	switch (*p) {
	    case '&':
		output += "&amp;";
		break;
	    case '<':
		output += "&lt;";
		break;
	    case '>':
		output += "&gt;";
		break;
	    default:
		output += *p;
	}
    }
}
/** Emit the next chunk of the snippet into `output`.
 *
 *  Called repeatedly after done(); each call appends (at most) one word of
 *  the chosen window, adding highlight markers and escaping XML special
 *  characters as it goes.
 *
 *  @param input	The full text the snippet is drawn from.
 *  @param hi_start	Marker inserted before a highlighted match.
 *  @param hi_end	Marker inserted after a highlighted match.
 *  @param omit		Marker used where text is elided (e.g. "...").
 *  @param output	String the snippet is accumulated into.
 *
 *  @return true if there is more to emit; false once the snippet is
 *	    complete.
 */
inline bool
SnipPipe::drain(const string & input,
		const string & hi_start,
		const string & hi_end,
		const string & omit,
		string & output)
{
    if (best_pipe.empty() && !pipe.empty()) {
	swap(best_pipe, pipe);
    }

    if (best_pipe.empty()) {
	// All words have been emitted - deal with the tail of the text.
	size_t tail_len = input.size() - best_end;
	if (tail_len == 0) return false;

	// See if this is the end of a sentence.
	// FIXME: This is quite simplistic - look at the Unicode rules:
	// https://unicode.org/reports/tr29/#Sentence_Boundaries
	bool sentence_end = false;
	Utf8Iterator i(input.data() + best_end, tail_len);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (sentence_end && Unicode::is_whitespace(ch)) break;

	    // Allow "...", "!!", "!?!", etc...
	    sentence_end = (ch == '.' || ch == '?' || ch == '!');

	    if (Unicode::is_wordchar(ch)) break;
	    ++i;
	}

	if (sentence_end) {
	    // Include end of sentence punctuation.
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    return false;
	}

	// Include trailing punctuation which includes meaning or context.
	i.assign(input.data() + best_end, tail_len);
	int trailing_punc = 0;
	while (i != Utf8Iterator() && snippet_check_trailing_nonwordchar(*i)) {
	    // But limit how much trailing punctuation we include.
	    if (++trailing_punc > 4) {
		trailing_punc = 0;
		break;
	    }
	    ++i;
	}
	if (trailing_punc) {
	    append_escaping_xml(input.data() + best_end, i.raw(), output);
	    if (i == Utf8Iterator()) return false;
	}

	// Append "..." or equivalent as this doesn't seem to be the start
	// of a sentence.
	output += omit;

	return false;
    }

    const Sniplet & word = best_pipe.front();

    if (output.empty()) {
	// Start of the snippet.
	enum { NO, PUNC, YES } sentence_boundary = (best_begin == 0) ? YES : NO;

	Utf8Iterator i(input.data() + best_begin, word.term_end - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    switch (sentence_boundary) {
		case NO:
		    if (ch == '.' || ch == '?' || ch == '!') {
			sentence_boundary = PUNC;
		    }
		    break;
		case PUNC:
		    if (Unicode::is_whitespace(ch)) {
			sentence_boundary = YES;
		    } else if (ch == '.' || ch == '?' || ch == '!') {
			// Allow "...", "!!", "!?!", etc...
		    } else {
			sentence_boundary = NO;
		    }
		    break;
		case YES:
		    break;
	    }

	    // Start the snippet at the start of the first word, but include
	    // certain punctuation too.
	    if (Unicode::is_wordchar(ch)) {
		// But limit how much leading punctuation we include.
		size_t word_begin = i.raw() - input.data();
		if (word_begin - best_begin > 4) {
		    best_begin = word_begin;
		}
		break;
	    }
	    ++i;
	    if (!snippet_check_leading_nonwordchar(ch)) {
		best_begin = i.raw() - input.data();
	    }
	}

	// Add "..." or equivalent if this doesn't seem to be the start of a
	// sentence.
	if (sentence_boundary != YES) {
	    output += omit;
	}
    }

    if (word.highlight) {
	// Don't include inter-word characters in the highlight.
	Utf8Iterator i(input.data() + best_begin, input.size() - best_begin);
	while (i != Utf8Iterator()) {
	    unsigned ch = *i;
	    if (Unicode::is_wordchar(ch)) {
		append_escaping_xml(input.data() + best_begin, i.raw(), output);
		best_begin = i.raw() - input.data();
		break;
	    }
	    ++i;
	}
    }

    if (!phrase_len) {
	// Not already inside a phrase highlight - start one if this word
	// begins a highlighted match.
	phrase_len = word.highlight;
	if (phrase_len) output += hi_start;
    }

    const char* p = input.data();
    append_escaping_xml(p + best_begin, p + word.term_end, output);
    best_begin = word.term_end;

    if (phrase_len && --phrase_len == 0) output += hi_end;

    best_pipe.pop_front();
    return true;
}
/** Recursively collect the parts of a query relevant to snippet highlighting.
 *
 *  @param query	  The query (or subquery) to inspect.
 *  @param exact_phrases  Filled with the term-lists of tight phrases found.
 *  @param loose_terms	  Filled with individual terms (mapped to 0, meaning
 *			  "relevance not yet computed" - see check_term()).
 *  @param wildcards	  Filled with wildcard patterns found.
 *  @param longest_phrase Updated with the length of the longest phrase seen.
 */
static void
check_query(const Xapian::Query & query,
	    list<vector<string>> & exact_phrases,
	    unordered_map<string, double> & loose_terms,
	    list<string> & wildcards,
	    size_t & longest_phrase)
{
    // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
    size_t n_subqs = query.get_num_subqueries();
    Xapian::Query::op op = query.get_type();
    if (op == query.LEAF_TERM) {
	const Xapian::Internal::QueryTerm & qt =
	    *static_cast<const Xapian::Internal::QueryTerm *>(query.internal.get());
	loose_terms.insert(make_pair(qt.get_term(), 0));
    } else if (op == query.OP_WILDCARD) {
	const Xapian::Internal::QueryWildcard & qw =
	    *static_cast<const Xapian::Internal::QueryWildcard *>(query.internal.get());
	wildcards.push_back(qw.get_pattern());
    } else if (op == query.OP_PHRASE) {
	const Xapian::Internal::QueryPhrase & phrase =
	    *static_cast<const Xapian::Internal::QueryPhrase *>(query.internal.get());
	if (phrase.get_window() == n_subqs) {
	    // Tight phrase.
	    for (size_t i = 0; i != n_subqs; ++i) {
		if (query.get_subquery(i).get_type() != query.LEAF_TERM)
		    goto non_term_subquery;
	    }

	    // Tight phrase of terms.
	    exact_phrases.push_back(vector<string>());
	    vector<string> & terms = exact_phrases.back();
	    terms.reserve(n_subqs);
	    for (size_t i = 0; i != n_subqs; ++i) {
		Xapian::Query q = query.get_subquery(i);
		const Xapian::Internal::QueryTerm & qt =
		    *static_cast<const Xapian::Internal::QueryTerm *>(q.internal.get());
		terms.push_back(qt.get_term());
	    }
	    if (n_subqs > longest_phrase) longest_phrase = n_subqs;
	    return;
	}
    }
non_term_subquery:
    // For any other operator (or a non-tight/non-term phrase), just recurse
    // over the subqueries.  Leaf cases above have n_subqs == 0 so this loop
    // is a no-op for them.
    for (size_t i = 0; i != n_subqs; ++i)
	check_query(query.get_subquery(i), exact_phrases, loose_terms,
		    wildcards, longest_phrase);
}
/** Look up `term` in the loose terms, lazily computing its relevance.
 *
 *  A cached relevance of 0.0 means "not yet computed" (as set up by
 *  check_query()); in that case the termweight is fetched from `stats` and
 *  cached, offset by max_tw so that matches always get a positive weight.
 *
 *  @return a pointer to the cached relevance, or NULL if `term` isn't a
 *	    loose query term (or has no termweight available).
 */
static double*
check_term(unordered_map<string, double> & loose_terms,
	   const Xapian::Weight::Internal * stats,
	   const string & term,
	   double max_tw)
{
    auto it = loose_terms.find(term);
    if (it == loose_terms.end()) return NULL;

    if (it->second == 0.0) {
	double relevance;
	if (!stats->get_termweight(term, relevance)) {
	    // FIXME: Assert?
	    loose_terms.erase(it);
	    return NULL;
	}

	it->second = relevance + max_tw;
    }
    return &it->second;
}
752 string
753 MSet::Internal::snippet(const string & text,
754 size_t length,
755 const Xapian::Stem & stemmer,
756 unsigned flags,
757 const string & hi_start,
758 const string & hi_end,
759 const string & omit) const
761 if (hi_start.empty() && hi_end.empty() && text.size() <= length) {
762 // Too easy!
763 return text;
766 bool cjk_ngram = (flags & MSet::SNIPPET_CJK_NGRAM);
767 if (!cjk_ngram) {
768 cjk_ngram = CJK::is_cjk_enabled();
771 size_t term_start = 0;
772 double min_tw = 0, max_tw = 0;
773 if (stats) stats->get_max_termweight(min_tw, max_tw);
774 if (max_tw == 0.0) {
775 max_tw = 1.0;
776 } else {
777 // Scale up by (1 + 1/64) so that highlighting works better for terms
778 // with termweight 0 (which happens for terms not in the database, and
779 // also with some weighting schemes for terms which occur in almost all
780 // documents.
781 max_tw *= 1.015625;
784 Xapian::Query query;
785 if (enquire.get()) {
786 query = enquire->query;
788 SnipPipe snip(length);
790 list<vector<string>> exact_phrases;
791 unordered_map<string, double> loose_terms;
792 list<string> wildcards;
793 size_t longest_phrase = 0;
794 check_query(query, exact_phrases, loose_terms,
795 wildcards, longest_phrase);
797 vector<double> exact_phrases_relevance;
798 exact_phrases_relevance.reserve(exact_phrases.size());
799 for (auto&& terms : exact_phrases) {
800 // FIXME: What relevance to use?
801 exact_phrases_relevance.push_back(max_tw * terms.size());
804 vector<double> wildcards_relevance;
805 wildcards_relevance.reserve(exact_phrases.size());
806 for (auto&& pattern : wildcards) {
807 // FIXME: What relevance to use?
808 (void)pattern;
809 wildcards_relevance.push_back(max_tw + min_tw);
812 // Background relevance is the same for a given MSet, so cache it
813 // between calls to MSet::snippet() on the same object.
814 unordered_map<string, double>& background = snippet_bg_relevance;
816 vector<string> phrase;
817 if (longest_phrase) phrase.resize(longest_phrase - 1);
818 size_t phrase_next = 0;
819 bool matchfound = false;
820 parse_terms(Utf8Iterator(text), cjk_ngram, true,
821 [&](const string & term, bool positional, const Utf8Iterator & it) {
822 // FIXME: Don't hardcode this here.
823 const size_t max_word_length = 64;
825 if (!positional) return true;
826 if (term.size() > max_word_length) return true;
828 // We get segments with any "inter-word" characters in front of
829 // each word, e.g.:
830 // [The][ cat][ sat][ on][ the][ mat]
831 size_t term_end = text.size() - it.left();
833 double* relevance = 0;
834 size_t highlight = 0;
835 if (stats) {
836 size_t i = 0;
837 for (auto&& terms : exact_phrases) {
838 if (term == terms.back()) {
839 size_t n = terms.size() - 1;
840 bool match = true;
841 while (n--) {
842 if (terms[n] != phrase[(n + phrase_next) % (longest_phrase - 1)]) {
843 match = false;
844 break;
847 if (match) {
848 // FIXME: Sort phrases, highest score first!
849 relevance = &exact_phrases_relevance[i];
850 highlight = terms.size();
851 goto relevance_done;
854 ++i;
857 relevance = check_term(loose_terms, stats, term, max_tw);
858 if (relevance) {
859 // Matched unstemmed term.
860 highlight = 1;
861 goto relevance_done;
864 string stem = "Z";
865 stem += stemmer(term);
866 relevance = check_term(loose_terms, stats, stem, max_tw);
867 if (relevance) {
868 // Matched stemmed term.
869 highlight = 1;
870 goto relevance_done;
873 // Check wildcards.
874 // FIXME: Sort wildcards, shortest pattern first or something?
875 i = 0;
876 for (auto&& pattern : wildcards) {
877 if (startswith(term, pattern)) {
878 relevance = &wildcards_relevance[i];
879 highlight = 1;
880 goto relevance_done;
882 ++i;
885 if (flags & Xapian::MSet::SNIPPET_BACKGROUND_MODEL) {
886 // Background document model.
887 auto bgit = background.find(term);
888 if (bgit == background.end()) bgit = background.find(stem);
889 if (bgit == background.end()) {
890 Xapian::doccount tf = enquire->db.get_termfreq(term);
891 if (!tf) {
892 tf = enquire->db.get_termfreq(stem);
893 } else {
894 stem = term;
896 double r = 0.0;
897 if (tf) {
898 // Add one to avoid log(0) when a term indexes all
899 // documents.
900 Xapian::doccount num_docs = stats->collection_size + 1;
901 r = max_tw * log((num_docs - tf) / double(tf));
902 r /= (length + 1) * log(double(num_docs));
903 #if 0
904 if (r <= 0) {
905 Utf8Iterator i(text.data() + term_start, text.data() + term_end);
906 while (i != Utf8Iterator()) {
907 if (Unicode::get_category(*i++) == Unicode::UPPERCASE_LETTER) {
908 r = max_tw * 0.05;
912 #endif
914 bgit = background.emplace(make_pair(stem, r)).first;
916 relevance = &bgit->second;
918 } else {
919 #if 0
920 // In the absence of weight information, assume longer terms
921 // are more relevant, and that unstemmed matches are a bit more
922 // relevant than stemmed matches.
923 if (queryterms.find(term) != queryterms.end()) {
924 relevance = term.size() * 3;
925 } else {
926 string stem = "Z";
927 stem += stemmer(term);
928 if (queryterms.find(stem) != queryterms.end()) {
929 relevance = term.size() * 2;
932 #endif
935 // FIXME: Allow Enquire without a DB set or an empty MSet() to be
936 // used if you don't want the collection model?
938 #if 0
939 // FIXME: Punctuation should somehow be included in the model, but this
940 // approach is problematic - we don't want the first word of a sentence
941 // to be favoured when it's at the end of the window.
943 // Give first word in each sentence a relevance boost.
944 if (term_start == 0) {
945 relevance += 10;
946 } else {
947 for (size_t i = term_start; i + term.size() < term_end; ++i) {
948 if (text[i] == '.' && Unicode::is_whitespace(text[i + 1])) {
949 relevance += 10;
950 break;
954 #endif
956 relevance_done:
957 if (longest_phrase) {
958 phrase[phrase_next] = term;
959 phrase_next = (phrase_next + 1) % (longest_phrase - 1);
962 if (highlight) matchfound = true;
964 if (!snip.pump(relevance, term_end, highlight, flags)) return false;
966 term_start = term_end;
967 return true;
970 snip.done();
972 // Put together the snippet.
973 string result;
974 if (matchfound || (flags & SNIPPET_EMPTY_WITHOUT_MATCH) == 0) {
975 while (snip.drain(text, hi_start, hi_end, omit, result)) { }
978 return result;