2 * @brief TermGenerator class internals
4 /* Copyright (C) 2007-2024 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "termgenerator_internal.h"
25 #include "api/msetinternal.h"
26 #include "api/queryinternal.h"
28 #include <xapian/document.h>
29 #include <xapian/queryparser.h>
30 #include <xapian/stem.h>
31 #include <xapian/unicode.h>
33 #include "stringutils.h"
41 #include <string_view>
42 #include <unordered_map>
45 #include "word-breaker.h"
52 U_isupper(unsigned ch
)
54 return ch
< 128 && C_isupper(static_cast<unsigned char>(ch
));
57 static inline unsigned
58 check_wordchar(unsigned ch
)
60 if (Unicode::is_wordchar(ch
)) return Unicode::tolower(ch
);
65 should_stem(const std::string
& term
)
67 const unsigned int SHOULD_STEM_MASK
=
68 (1 << Unicode::LOWERCASE_LETTER
) |
69 (1 << Unicode::TITLECASE_LETTER
) |
70 (1 << Unicode::MODIFIER_LETTER
) |
71 (1 << Unicode::OTHER_LETTER
);
73 return ((SHOULD_STEM_MASK
>> Unicode::get_category(*u
)) & 1);
76 /** Value representing "ignore this" when returned by check_infix() or
77 * check_infix_digit().
79 static const unsigned UNICODE_IGNORE
= numeric_limits
<unsigned>::max();
81 static inline unsigned
82 check_infix(unsigned ch
)
84 if (ch
== '\'' || ch
== '&' || ch
== 0xb7 || ch
== 0x5f4 || ch
== 0x2027) {
85 // Unicode includes all these except '&' in its word boundary rules,
86 // as well as 0x2019 (which we handle below) and ':' (for Swedish
87 // apparently, but we ignore this for now as it's problematic in
91 // 0x2019 is Unicode apostrophe and single closing quote.
92 // 0x201b is Unicode single opening quote with the tail rising.
93 if (ch
== 0x2019 || ch
== 0x201b) return '\'';
94 if (ch
>= 0x200b && (ch
<= 0x200d || ch
== 0x2060 || ch
== 0xfeff))
95 return UNICODE_IGNORE
;
99 static inline unsigned
100 check_infix_digit(unsigned ch
)
102 // This list of characters comes from Unicode's word identifying algorithm.
107 case 0x037e: // GREEK QUESTION MARK
108 case 0x0589: // ARMENIAN FULL STOP
109 case 0x060D: // ARABIC DATE SEPARATOR
110 case 0x07F8: // NKO COMMA
111 case 0x2044: // FRACTION SLASH
112 case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
113 case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
114 case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
117 if (ch
>= 0x200b && (ch
<= 0x200d || ch
== 0x2060 || ch
== 0xfeff))
118 return UNICODE_IGNORE
;
123 is_digit(unsigned ch
) {
124 return (Unicode::get_category(ch
) == Unicode::DECIMAL_DIGIT_NUMBER
);
/** Classify a codepoint as a term suffix character.
 *
 *  Returns ch if it may be appended to a term as a suffix (so "C++" and
 *  "C#" index as such), or 0 otherwise.
 */
static inline unsigned
check_suffix(unsigned ch)
{
    if (ch == '+' || ch == '#') return ch;
    // FIXME: what about '-'?
    return 0;
}
135 static_assert(int(MSet::SNIPPET_WORD_BREAKS
) == TermGenerator::FLAG_WORD_BREAKS
,
136 "WORD_BREAKS flags have same value");
// NOTE(review): this region has been mangled in transit - statements are
// split mid-token and some original lines (return type, braces, apparent
// preprocessor guards) are missing.  Code tokens left byte-identical below;
// comments only added.
//
// Purpose (from the visible calls): tokenise a span of text in a script
// without explicit word breaks.  When MSet::SNIPPET_WORD_BREAKS is set, use
// WordIterator over the unbroken span; otherwise fall back to NgramIterator.
// Each token is passed to action(token, with_positions, bytes_remaining).
138 template<typename ACTION
>
140 break_words(Utf8Iterator
& itor
, unsigned break_flags
, bool with_positions
,
144 if (break_flags
& MSet::SNIPPET_WORD_BREAKS
) {
145 const char* start
= itor
.raw();
146 // get_unbroken() returns the number of codepoints, which we aren't
147 // interested in here.
148 (void)get_unbroken(itor
);
149 size_t left
= itor
.raw() - start
;
// Iterate word tokens within [start, start + left).
150 for (WordIterator
tk(start
, left
); tk
!= WordIterator(); ++tk
) {
151 const string
& token
= *tk
;
152 left
-= token
.length();
153 if (!action(token
, with_positions
, itor
.left() + left
))
// Ngram fallback: emit overlapping ngrams for the unbroken span.
162 NgramIterator
tk(itor
);
163 while (tk
!= NgramIterator()) {
164 const string
& token
= *tk
;
165 // FLAG_NGRAMS only sets positions for tokens of length 1.
166 bool with_pos
= with_positions
&& tk
.unigram();
167 if (!action(token
, with_pos
, tk
.get_utf8iterator().left()))
171 // Update itor to point the end of the span of text in an unbroken script.
172 itor
= tk
.get_utf8iterator();
// NOTE(review): mangled region - statements split mid-token and some lines
// (loop headers, `goto endofterm` label, braces) are missing.  Code tokens
// left byte-identical; comments only added.
//
// Visible behaviour: scans UTF-8 text building one term at a time -
// handling acronyms ("U.N.C.L.E."), infix characters ('&' in "AT&T",
// apostrophes, digit separators), unbroken-script segments (delegated to
// break_words()), and '+'/'#' suffixes - then calls
// action(term, with_positions, itor.left()) for each term.
176 /** Templated framework for processing terms.
178 * Calls action(term, positional) for each term to add, where term is a
179 * std::string holding the term, and positional is a bool indicating
180 * if this term carries positional information.
182 template<typename ACTION
>
184 parse_terms(Utf8Iterator itor
, unsigned break_flags
, bool with_positions
,
188 // Advance to the start of the next term.
191 if (itor
== Utf8Iterator()) return;
192 ch
= check_wordchar(*itor
);
198 // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
199 // Don't worry if there's a trailing '.' or not.
200 if (U_isupper(*itor
)) {
201 const Utf8Iterator end
;
202 Utf8Iterator p
= itor
;
204 Unicode::append_utf8(term
, Unicode::tolower(*p
++));
205 } while (p
!= end
&& *p
== '.' && ++p
!= end
&& U_isupper(*p
));
206 // One letter does not make an acronym! If we handled a single
207 // uppercase letter here, we wouldn't catch M&S below.
208 if (term
.size() > 1) {
209 // Check there's not a (lower case) letter or digit
210 // immediately after it.
211 if (p
== end
|| !Unicode::is_wordchar(*p
)) {
// Delegate spans of unbroken-script word characters to break_words().
220 if (break_flags
&& is_unbroken_wordchar(*itor
)) {
221 if (!break_words(itor
, break_flags
, with_positions
, action
))
224 if (itor
== Utf8Iterator()) return;
225 ch
= check_wordchar(*itor
);
233 Unicode::append_utf8(term
, ch
);
235 if (++itor
== Utf8Iterator() ||
236 (break_flags
&& is_unbroken_script(*itor
)))
238 ch
= check_wordchar(*itor
);
// Peek at the character after the current (non-word) one to decide if
// it joins two word characters (infix) or ends the term.
241 Utf8Iterator
next(itor
);
243 if (next
== Utf8Iterator()) break;
244 unsigned nextch
= check_wordchar(*next
);
246 unsigned infix_ch
= *itor
;
247 if (is_digit(prevch
) && is_digit(*next
)) {
248 infix_ch
= check_infix_digit(infix_ch
);
250 // Handle things like '&' in AT&T, apostrophes, etc.
251 infix_ch
= check_infix(infix_ch
);
253 if (!infix_ch
) break;
254 if (infix_ch
!= UNICODE_IGNORE
)
255 Unicode::append_utf8(term
, infix_ch
);
// Allow a limited run of suffix characters ('+', '#') on the term.
261 size_t len
= term
.size();
263 while ((ch
= check_suffix(*itor
))) {
268 Unicode::append_utf8(term
, ch
);
269 if (++itor
== Utf8Iterator()) goto endofterm
;
271 // Don't index fish+chips as fish+ chips.
272 if (Unicode::is_wordchar(*itor
))
277 if (!action(term
, with_positions
, itor
.left()))
// NOTE(review): mangled region - statements split mid-token, some lines
// missing.  Code tokens left byte-identical; comments only added.
//
// TermGenerator::Internal::index_text: feeds parse_terms() with a lambda
// which adds each term (prefixed via a reused buffer) to `doc` as a term or
// posting, optionally adds it to the spelling data, and - depending on
// `strategy` and the stop mode - also adds a 'Z'-prefixed stemmed form.
// Throws RangeError when the term position limit would be exceeded, and
// FeatureUnavailableError if FLAG_WORD_BREAKS is requested without ICU.
283 TermGenerator::Internal::index_text(Utf8Iterator itor
, termcount wdf_inc
,
284 string_view prefix
, bool with_positions
)
287 if (flags
& FLAG_WORD_BREAKS
) {
288 throw Xapian::FeatureUnavailableError("FLAG_WORD_BREAKS requires "
289 "building Xapian to use ICU");
292 unsigned break_flags
= flags
& (FLAG_NGRAMS
| FLAG_WORD_BREAKS
);
293 if (break_flags
== 0 && is_ngram_enabled()) {
294 break_flags
= FLAG_NGRAMS
;
297 stop_strategy current_stop_mode
;
299 current_stop_mode
= TermGenerator::STOP_NONE
;
301 current_stop_mode
= stop_mode
;
304 // Create two std::string objects which we effectively use as buffers to
305 // build the prefixed terms we generate (one for non-stemmed and one for
306 // stemmed terms). This is a simple way to avoid creating a temporary
307 // std::string object each time we generate a prefixed term.
308 string prefixed_term
;
309 auto prefix_size
= prefix
.size();
310 prefixed_term
.reserve(prefix_size
+ max_word_length
);
311 prefixed_term
.assign(prefix
);
313 string prefixed_stemmed_term
;
// STEM_ALL omits the 'Z' prefix on stemmed terms.
314 int add_z
= (strategy
!= TermGenerator::STEM_ALL
);
315 prefixed_stemmed_term
.reserve(add_z
+ prefix_size
+ max_word_length
);
316 if (add_z
) prefixed_stemmed_term
.assign(1, 'Z');
317 prefixed_stemmed_term
.append(prefix
);
318 auto prefixed_stemmed_size
= prefixed_stemmed_term
.size();
320 parse_terms(itor
, break_flags
, with_positions
,
321 [=, &prefixed_term
, &prefixed_stemmed_term
322 #if __cplusplus >= 201907L
323 // C++20 no longer supports implicit `this` in lambdas but older C++ versions
324 // don't allow `this` here.
327 ](const string
& term
, bool positional
, size_t) {
328 if (term
.size() > max_word_length
) return true;
330 if (current_stop_mode
== TermGenerator::STOP_ALL
&&
335 if (strategy
== TermGenerator::STEM_SOME
||
336 strategy
== TermGenerator::STEM_NONE
||
337 strategy
== TermGenerator::STEM_SOME_FULL_POS
) {
// Add the unstemmed form (reusing the prefixed buffer).
338 prefixed_term
.append(term
);
340 if (rare(cur_pos
>= pos_limit
))
341 throw Xapian::RangeError("termpos limit exceeded");
342 doc
.add_posting(prefixed_term
, ++cur_pos
, wdf_inc
);
344 doc
.add_term(prefixed_term
, wdf_inc
);
346 prefixed_term
.resize(prefix_size
);
349 // MSVC seems to need "this->" on member variables in this
351 if ((this->flags
& FLAG_SPELLING
) && prefix_size
== 0)
352 db
.add_spelling(term
);
354 if (strategy
== TermGenerator::STEM_NONE
|| stemmer
.is_none())
357 if (strategy
== TermGenerator::STEM_SOME
||
358 strategy
== TermGenerator::STEM_SOME_FULL_POS
) {
359 if (current_stop_mode
== TermGenerator::STOP_STEMMED
&&
363 // Note, this uses the lowercased term, but that's OK as we
364 // only want to avoid stemming terms starting with a digit.
365 if (!should_stem(term
)) return true;
368 // Add stemmed form without positional information.
369 const string
& stem
= stemmer(term
);
370 if (rare(stem
.empty())) return true;
371 prefixed_stemmed_term
.append(stem
);
372 if (strategy
!= TermGenerator::STEM_SOME
&& positional
) {
373 if (strategy
!= TermGenerator::STEM_SOME_FULL_POS
) {
374 if (rare(cur_pos
>= pos_limit
))
375 throw Xapian::RangeError("termpos limit exceeded");
378 doc
.add_posting(prefixed_stemmed_term
, cur_pos
, wdf_inc
);
380 doc
.add_term(prefixed_stemmed_term
, wdf_inc
);
382 prefixed_stemmed_term
.resize(prefixed_stemmed_size
);
// Constructor for a candidate snippet word: `r` points at its (shared,
// decayable) relevance value, `t` is the byte offset just past the word in
// the input text, and `h` is the highlight length (0 = not highlighted).
// NOTE(review): the enclosing class definition is not visible here.
395 Sniplet(double* r
, size_t t
, size_t h
)
396 : relevance(r
), term_end(t
), highlight(h
) { }
// SnipPipe class interior (class header not visible in this view):
// a sliding window ("pipe") of Sniplet words used to pick the
// best-scoring snippet of roughly the requested length.
// NOTE(review): several member declarations appear to be missing from this
// mangled view (e.g. the `pipe` deque, `length`, `begin`, `sum`, `best_end`,
// `best_sum` referenced by pump()/drain() below).
473 deque
<Sniplet
// Words making up the current best snippet candidate.
> best_pipe
;
403 // Requested length for snippet.
406 // Position in text of start of current pipe contents.
409 // Rolling sum of the current pipe contents.
412 size_t phrase_len
= 0;
415 size_t best_begin
= 0;
421 // Add one to length to allow for inter-word space.
422 // FIXME: We ought to correctly allow for multiple spaces.
423 explicit SnipPipe(size_t length_
) : length(length_
+ 1) { }
// Feed one word into the window; returns false to stop early.
425 bool pump(double* r
, size_t t
, size_t h
, unsigned flags
);
// Emit the chosen snippet incrementally; returns false when done.
429 bool drain(string_view input
,
430 string_view hi_start
,
// NOTE(review): mangled region - statements split mid-token, some lines
// missing.  Code tokens left byte-identical; comments only added.
//
// SnipPipe::pump: push one term (relevance slot `r`, end offset `t`,
// highlight length `h`) into the sliding window; decay and subtract
// already-counted relevance, shrink the window back to `length`, and track
// the best-scoring window seen so far.  SNIPPET_EXHAUSTIVE disables the
// early-termination heuristic.
439 SnipPipe::pump(double* r
, size_t t
, size_t h
, unsigned flags
)
442 if (pipe
.size() >= h
- 1) {
443 // The final term of a phrase is entering the window. Peg the
444 // phrase's relevance onto the first term of the phrase, so it'll
445 // be removed from `sum` when the phrase starts to leave the
447 auto & phrase_start
= pipe
[pipe
.size() - (h
- 1)];
448 if (phrase_start
.relevance
) {
449 *phrase_start
.relevance
*= DECAY
;
450 sum
-= *phrase_start
.relevance
;
453 phrase_start
.relevance
= r
;
454 phrase_start
.highlight
= h
;
460 pipe
.emplace_back(r
, t
, h
);
466 // If necessary, discard words from the start of the pipe until it has the
468 // FIXME: Also shrink the window past words with relevance < 0?
469 while (t
- begin
> length
/* || pipe.front().relevance < 0.0 */) {
470 const Sniplet
& word
= pipe
.front();
471 if (word
.relevance
) {
472 *word
.relevance
*= DECAY
;
473 sum
-= *word
.relevance
;
475 begin
= word
.term_end
;
// Words which overlap the current best window are kept for drain().
476 if (best_end
>= begin
)
477 best_pipe
.push_back(word
);
479 // E.g. can happen if the current term is longer than the requested
481 if (rare(pipe
.empty())) break;
484 // Using > here doesn't work well, as we don't extend a snippet over terms
486 if (sum
>= best_sum
) {
487 // Discard any part of `best_pipe` which is before `begin`.
488 if (begin
>= best_end
) {
491 while (!best_pipe
.empty() &&
492 best_pipe
.front().term_end
<= begin
) {
493 best_pipe
.pop_front();
499 } else if ((flags
& Xapian::MSet::SNIPPET_EXHAUSTIVE
) == 0) {
500 if (best_sum
> 0 && best_end
< begin
) {
501 // We found something, and we aren't still looking near it.
502 // FIXME: Benchmark this and adjust if necessary.
512 // Discard any part of `pipe` which is after `best_end`.
513 if (begin
>= best_end
) {
516 // We should never empty the pipe (as that case should be handled
518 while (rare(!pipe
.empty()) &&
519 pipe
.back().term_end
> best_end
) {
// NOTE(review): mangled/truncated - the function's return type, switch
// header and returns are missing from this view.  Code tokens left
// byte-identical; comments only added.
//
// Decide whether a non-word character should be kept at the START of a
// snippet: currency symbols, opening/initial-quote punctuation, and a few
// specific marks (inverted punctuation, section sign) are kept.
525 // Check if a non-word character is should be included at the start of the
526 // snippet. We want to include certain leading non-word characters, but not
529 snippet_check_leading_nonwordchar(unsigned ch
) {
530 if (Unicode::is_currency(ch
) ||
531 Unicode::get_category(ch
) == Unicode::OPEN_PUNCTUATION
||
532 Unicode::get_category(ch
) == Unicode::INITIAL_QUOTE_PUNCTUATION
) {
549 case 0x00A1: // INVERTED EXCLAMATION MARK
550 case 0x00A7: // SECTION SIGN
551 case 0x00BF: // INVERTED QUESTION MARK
// NOTE(review): mangled/truncated - return type and body tail missing from
// this view.  Code tokens left byte-identical; comments only added.
//
// Decide whether a non-word character should be kept at the END of a
// snippet: currency symbols and closing/final-quote punctuation are kept.
557 // Check if a non-word character is should be included at the end of the
558 // snippet. We want to include certain trailing non-word characters, but not
561 snippet_check_trailing_nonwordchar(unsigned ch
) {
562 if (Unicode::is_currency(ch
) ||
563 Unicode::get_category(ch
) == Unicode::CLOSE_PUNCTUATION
||
564 Unicode::get_category(ch
) == Unicode::FINAL_QUOTE_PUNCTUATION
) {
// Append [p, end) to `output`, XML-escaping as needed (judging by the name
// and call sites in drain() below).
// NOTE(review): only the signature is visible here - the function body is
// missing from this mangled view.
585 append_escaping_xml(const char* p
, const char* end
, string
& output
)
// NOTE(review): mangled region - statements split mid-token, some lines
// (parameter list tail, braces, `return` statements) missing.  Code tokens
// left byte-identical; comments only added.
//
// SnipPipe::drain: emit the selected snippet one word at a time.  Handles
// the snippet's leading edge (sentence-boundary detection, limited leading
// punctuation, ellipsis when not at a sentence start), highlight start/end
// markers around matched phrases, and trailing punctuation/ellipsis at the
// end of the text.  Returns false when there is nothing more to emit.
606 SnipPipe::drain(string_view input
,
607 string_view hi_start
,
612 if (best_pipe
.empty() && !pipe
.empty()) {
613 swap(best_pipe
, pipe
);
616 if (best_pipe
.empty()) {
// All words emitted; deal with the tail of the input text.
617 size_t tail_len
= input
.size() - best_end
;
618 if (tail_len
== 0) return false;
620 // See if this is the end of a sentence.
621 // FIXME: This is quite simplistic - look at the Unicode rules:
622 // https://unicode.org/reports/tr29/#Sentence_Boundaries
623 bool sentence_end
= false;
624 Utf8Iterator
i(input
.data() + best_end
, tail_len
);
625 while (i
!= Utf8Iterator()) {
627 if (sentence_end
&& Unicode::is_whitespace(ch
)) break;
629 // Allow "...", "!!", "!?!", etc...
630 sentence_end
= (ch
== '.' || ch
== '?' || ch
== '!');
632 if (Unicode::is_wordchar(ch
)) break;
637 // Include end of sentence punctuation.
638 append_escaping_xml(input
.data() + best_end
, i
.raw(), output
);
642 // Include trailing punctuation which includes meaning or context.
643 i
.assign(input
.data() + best_end
, tail_len
);
644 int trailing_punc
= 0;
645 while (i
!= Utf8Iterator() && snippet_check_trailing_nonwordchar(*i
)) {
646 // But limit how much trailing punctuation we include.
647 if (++trailing_punc
> 4) {
654 append_escaping_xml(input
.data() + best_end
, i
.raw(), output
);
655 if (i
== Utf8Iterator()) return false;
658 // Append "..." or equivalent as this doesn't seem to be the start
// Emit the next word of the snippet.
665 const Sniplet
& word
= best_pipe
.front();
667 if (output
.empty()) {
668 // Start of the snippet.
669 enum { NO
, PUNC
, YES
} sentence_boundary
= (best_begin
== 0) ? YES
: NO
;
671 Utf8Iterator
i(input
.data() + best_begin
, word
.term_end
- best_begin
);
672 while (i
!= Utf8Iterator()) {
674 switch (sentence_boundary
) {
676 if (ch
== '.' || ch
== '?' || ch
== '!') {
677 sentence_boundary
= PUNC
;
681 if (Unicode::is_whitespace(ch
)) {
682 sentence_boundary
= YES
;
683 } else if (ch
== '.' || ch
== '?' || ch
== '!') {
684 // Allow "...", "!!", "!?!", etc...
686 sentence_boundary
= NO
;
693 // Start the snippet at the start of the first word, but include
694 // certain punctuation too.
695 if (Unicode::is_wordchar(ch
)) {
696 // But limit how much leading punctuation we include.
697 size_t word_begin
= i
.raw() - input
.data();
698 if (word_begin
- best_begin
> 4) {
699 best_begin
= word_begin
;
704 if (!snippet_check_leading_nonwordchar(ch
)) {
705 best_begin
= i
.raw() - input
.data();
709 // Add "..." or equivalent if this doesn't seem to be the start of a
711 if (sentence_boundary
!= YES
) {
716 if (word
.highlight
) {
717 // Don't include inter-word characters in the highlight.
718 Utf8Iterator
i(input
.data() + best_begin
, input
.size() - best_begin
);
719 while (i
!= Utf8Iterator()) {
721 if (Unicode::is_wordchar(ch
)) {
722 append_escaping_xml(input
.data() + best_begin
, i
.raw(), output
);
723 best_begin
= i
.raw() - input
.data();
// Open a highlight covering `word.highlight` words.
731 phrase_len
= word
.highlight
;
732 if (phrase_len
) output
+= hi_start
;
735 const char* p
= input
.data();
736 append_escaping_xml(p
+ best_begin
, p
+ word
.term_end
, output
);
737 best_begin
= word
.term_end
;
739 if (phrase_len
&& --phrase_len
== 0) output
+= hi_end
;
741 best_pipe
.pop_front();
// NOTE(review): mangled region - statements split mid-token, some lines
// (braces, `non_term_subquery:` label) missing.  Code tokens left
// byte-identical; comments only added.
//
// Recursively decompose `query` into the pieces the snippet highlighter can
// match: leaf terms -> loose_terms, OP_WILDCARD -> wildcards,
// OP_EDIT_DISTANCE -> fuzzies, and tight OP_PHRASE of leaf terms ->
// exact_phrases (also tracking the longest phrase length); everything else
// recurses into subqueries.
746 check_query(const Xapian::Query
& query
,
747 list
<vector
<string
>> & exact_phrases
,
748 unordered_map
<string
, double> & loose_terms
,
749 list
<const Xapian::Internal::QueryWildcard
*> & wildcards
,
750 list
<const Xapian::Internal::QueryEditDistance
*> & fuzzies
,
751 size_t & longest_phrase
)
753 // FIXME: OP_NEAR, non-tight OP_PHRASE, OP_PHRASE with non-term subqueries
754 size_t n_subqs
= query
.get_num_subqueries();
755 Xapian::Query::op op
= query
.get_type();
756 if (op
== query
.LEAF_TERM
) {
757 const Xapian::Internal::QueryTerm
& qt
=
758 *static_cast<const Xapian::Internal::QueryTerm
*>(query
.internal
.get());
// Relevance 0 here means "not yet computed" (see check_term()).
759 loose_terms
.insert(make_pair(qt
.get_term(), 0));
760 } else if (op
== query
.OP_WILDCARD
) {
761 using Xapian::Internal::QueryWildcard
;
762 const QueryWildcard
* qw
=
763 static_cast<const QueryWildcard
*>(query
.internal
.get());
764 wildcards
.push_back(qw
);
765 } else if (op
== query
.OP_EDIT_DISTANCE
) {
766 using Xapian::Internal::QueryEditDistance
;
767 const QueryEditDistance
* qed
=
768 static_cast<const QueryEditDistance
*>(query
.internal
.get());
769 fuzzies
.push_back(qed
);
770 } else if (op
== query
.OP_PHRASE
) {
771 const Xapian::Internal::QueryPhrase
& phrase
=
772 *static_cast<const Xapian::Internal::QueryPhrase
*>(query
.internal
.get());
// A "tight" phrase has window == number of subqueries.
773 if (phrase
.get_window() == n_subqs
) {
775 for (size_t i
= 0; i
!= n_subqs
; ++i
) {
776 if (query
.get_subquery(i
).get_type() != query
.LEAF_TERM
)
777 goto non_term_subquery
;
780 // Tight phrase of terms.
781 exact_phrases
.push_back(vector
<string
>());
782 vector
<string
> & terms
= exact_phrases
.back();
783 terms
.reserve(n_subqs
);
784 for (size_t i
= 0; i
!= n_subqs
; ++i
) {
785 Xapian::Query q
= query
.get_subquery(i
);
786 const Xapian::Internal::QueryTerm
& qt
=
787 *static_cast<const Xapian::Internal::QueryTerm
*>(q
.internal
.get());
788 terms
.push_back(qt
.get_term());
790 if (n_subqs
> longest_phrase
) longest_phrase
= n_subqs
;
// Recurse into subqueries of compound operators.
795 for (size_t i
= 0; i
!= n_subqs
; ++i
)
796 check_query(query
.get_subquery(i
), exact_phrases
, loose_terms
,
797 wildcards
, fuzzies
, longest_phrase
);
// NOTE(review): mangled region - statements split mid-token; the return
// type, tail of the parameter list and some body lines are missing from
// this view.  Code tokens left byte-identical; comments only added.
//
// Look up `term` in loose_terms.  Returns NULL if absent; otherwise lazily
// computes the cached relevance (from `stats`, offset by max_tw), erasing
// the entry when the stats have no weight for the term.
801 check_term(unordered_map
<string
, double> & loose_terms
,
802 const Xapian::Weight::Internal
* stats
,
806 auto it
= loose_terms
.find(term
);
807 if (it
== loose_terms
.end()) return NULL
;
// Zero means "relevance not computed yet".
809 if (it
->second
== 0.0) {
811 if (!stats
->get_termweight(term
, relevance
)) {
// Not in the stats: drop it so we don't re-check every time.
813 loose_terms
.erase(it
);
817 it
->second
= relevance
+ max_tw
;
// NOTE(review): mangled region - statements split mid-token, many lines
// missing (parameter list entries, braces, some statements).  Code tokens
// left byte-identical; comments only added.
//
// MSet::Internal::snippet: build a highlighted snippet of roughly `length`
// bytes from `text`.  Decomposes the query via check_query(), assigns each
// parsed term a relevance (exact phrase, loose term, wildcard, fuzzy match,
// or - with SNIPPET_BACKGROUND_MODEL - a cached background-model weight
// derived from term frequencies), pumps the terms through a SnipPipe, and
// finally drains the best window into the result with hi_start/hi_end
// highlight markers.  Throws FeatureUnavailableError if
// SNIPPET_WORD_BREAKS is requested without ICU.
823 MSet::Internal::snippet(string_view text
,
825 const Xapian::Stem
& stemmer
,
827 string_view hi_start
,
829 string_view omit
) const
// Fast path: no markup requested and the text already fits.
831 if (hi_start
.empty() && hi_end
.empty() && text
.size() <= length
) {
837 if (flags
& MSet::SNIPPET_WORD_BREAKS
) {
838 throw Xapian::FeatureUnavailableError("SNIPPET_WORD_BREAKS requires "
839 "building Xapian to use ICU");
842 auto SNIPPET_BREAK_MASK
= MSet::SNIPPET_NGRAMS
| MSet::SNIPPET_WORD_BREAKS
;
843 unsigned break_flags
= flags
& SNIPPET_BREAK_MASK
;
844 if (break_flags
== 0 && is_ngram_enabled()) {
845 break_flags
= MSet::SNIPPET_NGRAMS
;
848 size_t term_start
= 0;
849 double min_tw
= 0, max_tw
= 0;
850 if (stats
) stats
->get_max_termweight(min_tw
, max_tw
);
854 // Scale up by (1 + 1/64) so that highlighting works better for terms
855 // with termweight 0 (which happens for terms not in the database, and
856 // also with some weighting schemes for terms which occur in almost all
863 query
= enquire
->query
;
865 SnipPipe
snip(length
);
// Decompose the query into matchable pieces.
867 list
<vector
<string
>> exact_phrases
;
868 unordered_map
<string
, double> loose_terms
;
869 list
<const Xapian::Internal::QueryWildcard
*> wildcards
;
870 list
<const Xapian::Internal::QueryEditDistance
*> fuzzies
;
871 size_t longest_phrase
= 0;
872 check_query(query
, exact_phrases
, loose_terms
,
873 wildcards
, fuzzies
, longest_phrase
);
875 vector
<double> exact_phrases_relevance
;
876 exact_phrases_relevance
.reserve(exact_phrases
.size());
877 for (auto&& terms
: exact_phrases
) {
878 // FIXME: What relevance to use?
879 exact_phrases_relevance
.push_back(max_tw
* terms
.size());
882 vector
<double> wildcards_relevance
;
883 wildcards_relevance
.reserve(wildcards
.size());
884 for (auto&& pattern
: wildcards
) {
885 // FIXME: What relevance to use?
887 wildcards_relevance
.push_back(max_tw
+ min_tw
);
890 vector
<double> fuzzies_relevance
;
891 fuzzies_relevance
.reserve(fuzzies
.size());
892 for (auto&& pattern
: fuzzies
) {
893 // FIXME: What relevance to use?
895 fuzzies_relevance
.push_back(max_tw
+ min_tw
);
898 // Background relevance is the same for a given MSet, so cache it
899 // between calls to MSet::snippet() on the same object.
900 unordered_map
<string
, double>& background
= snippet_bg_relevance
;
// Ring buffer of the most recent terms, for phrase matching.
902 vector
<string
> phrase
;
903 if (longest_phrase
) phrase
.resize(longest_phrase
- 1);
904 size_t phrase_next
= 0;
905 bool matchfound
= false;
906 parse_terms(Utf8Iterator(text
), break_flags
, true,
907 [&](const string
& term
, bool positional
, size_t left
) {
908 // FIXME: Don't hardcode this here.
909 const size_t max_word_length
= 64;
911 if (!positional
) return true;
912 if (term
.size() > max_word_length
) return true;
914 // We get segments with any "inter-word" characters in front of
916 // [The][ cat][ sat][ on][ the][ mat]
917 size_t term_end
= text
.size() - left
;
919 double* relevance
= 0;
920 size_t highlight
= 0;
// Try to match an exact phrase ending at this term.
923 for (auto&& terms
: exact_phrases
) {
924 if (term
== terms
.back()) {
925 size_t n
= terms
.size() - 1;
927 while (UNSIGNED_OVERFLOW_OK(n
--)) {
928 if (terms
[n
] != phrase
[(n
+ phrase_next
) % (longest_phrase
- 1)]) {
934 // FIXME: Sort phrases, highest score first!
935 relevance
= &exact_phrases_relevance
[i
];
936 highlight
= terms
.size();
943 relevance
= check_term(loose_terms
, stats
.get(), term
, max_tw
);
945 // Matched unstemmed term.
951 stem
+= stemmer(term
);
952 relevance
= check_term(loose_terms
, stats
.get(), stem
, max_tw
);
954 // Matched stemmed term.
960 // FIXME: Sort wildcards, cheapest to check first or something?
962 for (auto&& qw
: wildcards
) {
963 if (qw
->test(term
)) {
964 relevance
= &wildcards_relevance
[i
];
972 // FIXME: Sort fuzzies, cheapest to check first or something?
974 for (auto&& qed
: fuzzies
) {
975 // test() returns 0 for no match, or edit_distance + 1.
976 int ed_result
= qed
->test(term
);
978 // FIXME: Reduce relevance the more edits there are?
979 // We can't just divide by ed_result here as this
980 // relevance is used by any term matching this
982 relevance
= &fuzzies_relevance
[i
];
989 if (flags
& Xapian::MSet::SNIPPET_BACKGROUND_MODEL
) {
990 // Background document model.
991 auto bgit
= background
.find(term
);
992 if (bgit
== background
.end()) bgit
= background
.find(stem
);
993 if (bgit
== background
.end()) {
994 Xapian::doccount tf
= enquire
->db
.get_termfreq(term
);
996 tf
= enquire
->db
.get_termfreq(stem
);
1002 // Add one to avoid log(0) when a term indexes all
1004 Xapian::doccount num_docs
= stats
->collection_size
+ 1;
1005 r
= max_tw
* log((num_docs
- tf
) / double(tf
));
1006 r
/= (length
+ 1) * log(double(num_docs
));
// Check for an upper-case letter in the raw text of the term.
1009 Utf8Iterator
i(text
.data() + term_start
, text
.data() + term_end
);
1010 while (i
!= Utf8Iterator()) {
1011 if (Unicode::get_category(*i
++) == Unicode::UPPERCASE_LETTER
) {
1018 bgit
= background
.emplace(make_pair(stem
, r
)).first
;
1020 relevance
= &bgit
->second
;
1024 // In the absence of weight information, assume longer terms
1025 // are more relevant, and that unstemmed matches are a bit more
1026 // relevant than stemmed matches.
1027 if (queryterms
.find(term
) != queryterms
.end()) {
1028 relevance
= term
.size() * 3;
1031 stem
+= stemmer(term
);
1032 if (queryterms
.find(stem
) != queryterms
.end()) {
1033 relevance
= term
.size() * 2;
1039 // FIXME: Allow Enquire without a DB set or an empty MSet() to be
1040 // used if you don't want the collection model?
1043 // FIXME: Punctuation should somehow be included in the model, but this
1044 // approach is problematic - we don't want the first word of a sentence
1045 // to be favoured when it's at the end of the window.
1047 // Give first word in each sentence a relevance boost.
1048 if (term_start
== 0) {
1051 for (size_t i
= term_start
; i
+ term
.size() < term_end
; ++i
) {
1052 if (text
[i
] == '.' && Unicode::is_whitespace(text
[i
+ 1])) {
// Record this term in the phrase ring buffer.
1061 if (longest_phrase
) {
1062 phrase
[phrase_next
] = term
;
1063 phrase_next
= (phrase_next
+ 1) % (longest_phrase
- 1);
1066 if (highlight
) matchfound
= true;
1068 if (!snip
.pump(relevance
, term_end
, highlight
, flags
)) return false;
1070 term_start
= term_end
;
1076 // Put together the snippet.
1078 if (matchfound
|| (flags
& SNIPPET_EMPTY_WITHOUT_MATCH
) == 0) {
1079 while (snip
.drain(text
, hi_start
, hi_end
, omit
, result
)) { }