1 /* omdocument.cc: class for performing a match
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002 Ananova Ltd
5 * Copyright 2003,2004,2006,2007,2008,2009,2011,2013,2014,2018 Olly Betts
6 * Copyright 2009 Lemur Consulting Ltd
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
26 #include <xapian/document.h>
28 #include "backends/document.h"
29 #include "documentvaluelist.h"
30 #include "maptermlist.h"
31 #include "net/serialise.h"
34 #include "unicode/description_append.h"
36 #include <xapian/error.h>
37 #include <xapian/types.h>
38 #include <xapian/valueiterator.h>
48 // implementation of Document
50 Document::Document(Document::Internal
*internal_
) : internal(internal_
)
54 Document::Document(Document
&&) = default;
57 Document::operator=(Document
&&) = default;
59 Document::Document() : internal(new Xapian::Document::Internal
)
64 Document::get_value(Xapian::valueno slot
) const
66 LOGCALL(API
, string
, "Document::get_value", slot
);
67 RETURN(internal
->get_value(slot
));
71 Document::get_data() const
73 LOGCALL(API
, string
, "Document::get_data", NO_ARGS
);
74 RETURN(internal
->get_data());
78 Document::set_data(const string
&data
)
80 LOGCALL_VOID(API
, "Document::set_data", data
);
81 internal
->set_data(data
);
85 Document::operator=(const Document
&other
)
87 // pointers are reference counted.
88 internal
= other
.internal
;
91 Document::Document(const Document
&other
)
92 : internal(other
.internal
)
101 Document::get_description() const
103 return internal
->get_description();
107 Document::add_value(Xapian::valueno slot
, const string
&value
)
109 LOGCALL_VOID(API
, "Document::add_value", slot
| value
);
110 internal
->add_value(slot
, value
);
114 Document::remove_value(Xapian::valueno slot
)
116 LOGCALL_VOID(API
, "Document::remove_value", slot
);
117 internal
->remove_value(slot
);
121 Document::clear_values()
123 LOGCALL_VOID(API
, "Document::clear_values", NO_ARGS
);
124 internal
->clear_values();
128 Document::add_posting(const string
& tname
,
129 Xapian::termpos tpos
,
130 Xapian::termcount wdfinc
)
132 LOGCALL_VOID(API
, "Document::add_posting", tname
| tpos
| wdfinc
);
134 throw InvalidArgumentError("Empty termnames aren't allowed.");
136 internal
->add_posting(tname
, tpos
, wdfinc
);
140 Document::add_term(const string
& tname
, Xapian::termcount wdfinc
)
142 LOGCALL_VOID(API
, "Document::add_term", tname
| wdfinc
);
144 throw InvalidArgumentError("Empty termnames aren't allowed.");
146 internal
->add_term(tname
, wdfinc
);
150 Document::remove_posting(const string
& tname
, Xapian::termpos tpos
,
151 Xapian::termcount wdfdec
)
153 LOGCALL_VOID(API
, "Document::remove_posting", tname
| tpos
| wdfdec
);
155 throw InvalidArgumentError("Empty termnames aren't allowed.");
157 internal
->remove_posting(tname
, tpos
, wdfdec
);
161 Document::remove_postings(const string
& term
,
162 Xapian::termpos termpos_first
,
163 Xapian::termpos termpos_last
,
164 Xapian::termcount wdf_dec
)
167 throw InvalidArgumentError("Empty termnames aren't allowed.");
169 if (rare(termpos_first
> termpos_last
)) {
172 return internal
->remove_postings(term
, termpos_first
, termpos_last
,
177 Document::remove_term(const string
& tname
)
179 LOGCALL_VOID(API
, "Document::remove_term", tname
);
180 internal
->remove_term(tname
);
184 Document::clear_terms()
186 LOGCALL_VOID(API
, "Document::clear_terms", NO_ARGS
);
187 internal
->clear_terms();
191 Document::termlist_count() const {
192 LOGCALL(API
, Xapian::termcount
, "Document::termlist_count", NO_ARGS
);
193 RETURN(internal
->termlist_count());
197 Document::termlist_begin() const
199 LOGCALL(API
, TermIterator
, "Document::termlist_begin", NO_ARGS
);
200 RETURN(TermIterator(internal
->open_term_list()));
204 Document::values_count() const {
205 LOGCALL(API
, Xapian::termcount
, "Document::values_count", NO_ARGS
);
206 RETURN(internal
->values_count());
210 Document::values_begin() const
212 LOGCALL(API
, ValueIterator
, "Document::values_begin", NO_ARGS
);
213 // Calling values_count() has the side effect of making sure that they have
214 // been read into the std::map "values" member of internal.
215 if (internal
->values_count() == 0) RETURN(ValueIterator());
216 RETURN(ValueIterator(new DocumentValueList(internal
)));
220 Document::get_docid() const
222 LOGCALL(API
, docid
, "Document::get_docid", NO_ARGS
);
223 RETURN(internal
->get_docid());
227 Document::serialise() const
229 LOGCALL(API
, std::string
, "Document::serialise", NO_ARGS
);
230 RETURN(serialise_document(*this));
234 Document::unserialise(const std::string
&s
)
236 LOGCALL_STATIC(API
, Document
, "Document::unserialise", s
);
237 RETURN(unserialise_document(s
));
242 /////////////////////////////////////////////////////////////////////////////
// Calls inplace_merge() on `positions`, using `positions.begin() + split` as
// the boundary between the two runs.  Asserts that the term is not deleted.
// NOTE(review): this copy of the file has lost lines here (the final argument
// of the inplace_merge() call and whatever follows it are not present) —
// restore them from the pristine source before building.
245 OmDocumentTerm::merge() const
247 Assert(!is_deleted());
248 inplace_merge(positions
.begin(),
249 positions
.begin() + split
,
// Add position `tpos` to this term's position list (logged as returning
// bool).
//
// What is visible below:
//  - a special path for a term currently marked deleted;
//  - fast paths for an empty list, for tpos greater than the last entry (with
//    a duplicate check against the entries before `split`), and for tpos equal
//    to the last entry;
//  - otherwise a lower_bound() search, after which a not-yet-present tpos is
//    either insert()ed in place (when the new split point would not fit in the
//    type of `split`) or push_back()ed.
//
// NOTE(review): many lines of this function are missing from this copy of the
// file (return statements, the wdf updates, the merge call, closing braces).
// The embedded original line numbers show the gaps.  Restore from the pristine
// source; do not rely on the text below as complete.
255 OmDocumentTerm::add_position(Xapian::termcount wdf_inc
, Xapian::termpos tpos
)
257 LOGCALL(DB
, bool, "OmDocumentTerm::add_position", wdf_inc
| tpos
);
259 if (rare(is_deleted())) {
262 positions
.push_back(tpos
);
268 // Optimise the common case of adding positions in ascending order.
269 if (positions
.empty()) {
270 positions
.push_back(tpos
);
273 if (tpos
> positions
.back()) {
275 // Check for duplicate before split.
276 auto i
= lower_bound(positions
.cbegin(),
277 positions
.cbegin() + split
,
279 if (i
!= positions
.cbegin() + split
&& *i
== tpos
)
282 positions
.push_back(tpos
);
286 if (tpos
== positions
.back()) {
287 // Duplicate of last entry.
292 // We could merge in the new entry at the same time, but that seems to
293 // make things much more complex for minor gains.
297 // Search for the position the term occurs at. Use binary chop to
298 // search, since this is a sorted list.
299 vector
<Xapian::termpos
>::iterator i
;
300 i
= lower_bound(positions
.begin(), positions
.end(), tpos
);
301 if (i
== positions
.end() || *i
!= tpos
) {
302 auto new_split
= positions
.size();
303 if (sizeof(split
) < sizeof(Xapian::termpos
)) {
304 if (rare(new_split
> numeric_limits
<decltype(split
)>::max())) {
305 // The split point would be beyond the size of the type used to
306 // hold it, which is really unlikely if that type is 32-bit.
307 // Just insert the old way in this case.
308 positions
.insert(i
, tpos
);
312 // This assertion should always be true because we shouldn't have
313 // duplicate entries and the split point can't be after the final
315 AssertRel(new_split
, <=, numeric_limits
<decltype(split
)>::max());
318 positions
.push_back(tpos
);
// Remove position `tpos` from this term's position list.
//
// Visible behaviour: asserts the term is not deleted; an
// Xapian::InvalidArgumentError ("Position N not in list, can't remove") is
// thrown in the empty-list branch; a tpos equal to the last entry is handled
// with pop_back(); otherwise lower_bound() is used to look for the entry.
//
// NOTE(review): lines are missing from this copy of the file (the handling
// after the search, the adjustment of `split`, returns and closing braces) —
// restore from the pristine source.
324 OmDocumentTerm::remove_position(Xapian::termpos tpos
)
326 LOGCALL_VOID(DB
, "OmDocumentTerm::remove_position", tpos
);
328 Assert(!is_deleted());
330 if (rare(positions
.empty())) {
332 throw Xapian::InvalidArgumentError("Position " + str(tpos
) +
333 " not in list, can't remove");
336 // Special case removing the final position, which we can handle in O(1).
337 if (positions
.back() == tpos
) {
338 positions
.pop_back();
339 if (split
== positions
.size()) {
341 // We removed the only entry from after the split.
347 // We could remove the requested entry at the same time, but that seems
352 // We keep positions sorted, so use lower_bound() which can binary chop to
354 auto i
= lower_bound(positions
.begin(), positions
.end(), tpos
);
355 if (i
== positions
.end() || *i
!= tpos
) {
// Remove every stored position in the inclusive range
// [termpos_first, termpos_last] and return how many entries were erased
// (computed as the difference in positions.size() before and after).
//
// The range [i, j) to erase is located with lower_bound()/upper_bound().
//
// NOTE(review): the LOGCALL name below reads "OmDocumentTerm::remove_position"
// (singular) although this is remove_positions() — looks like a copy/paste
// slip in the log string; confirm and fix.
// NOTE(review): lines are missing from this copy of the file (the body of the
// "nothing in range" branch and the code following the "fiddly to do" comment)
// — restore from the pristine source.
362 OmDocumentTerm::remove_positions(Xapian::termpos termpos_first
,
363 Xapian::termpos termpos_last
)
365 LOGCALL(DB
, Xapian::termpos
, "OmDocumentTerm::remove_position", termpos_first
| termpos_last
);
367 Assert(!is_deleted());
370 // We could remove the requested entries at the same time, but that
371 // seems fiddly to do.
375 // Find the range [i, j) that the specified termpos range maps to. Use
376 // binary chop to search, since this is a sorted list.
377 auto i
= lower_bound(positions
.begin(), positions
.end(), termpos_first
);
378 if (i
== positions
.end() || *i
> termpos_last
) {
381 auto j
= upper_bound(i
, positions
.end(), termpos_last
);
382 size_t size_before
= positions
.size();
383 positions
.erase(i
, j
);
384 return Xapian::termpos(size_before
- positions
.size());
// Build a description string of the form
// "OmDocumentTerm(wdf = <wdf>, positions[<number of positions>...".
// NOTE(review): the declaration of `description`, the closing part of the
// string and the return are missing from this copy of the file — restore from
// the pristine source.
388 OmDocumentTerm::get_description() const
391 description
= "OmDocumentTerm(wdf = ";
392 description
+= str(wdf
);
393 description
+= ", positions[";
394 description
+= str(positions
.size());
// Look up the value stored in `slot`.
//
// Visible behaviour: the `values` map is searched and an empty string is
// returned when the slot is absent; separately, an empty string is returned
// when there is no database, otherwise do_get_value(slot) is returned.
//
// NOTE(review): the condition choosing between the map lookup and the
// database path, and the return for a found entry, are missing from this copy
// of the file — restore from the pristine source.
400 Xapian::Document::Internal::get_value(Xapian::valueno slot
) const
403 map
<Xapian::valueno
, string
>::const_iterator i
;
404 i
= values
.find(slot
);
405 if (i
== values
.end()) return string();
408 if (!database
.get()) return string();
409 return do_get_value(slot
);
413 Xapian::Document::Internal::get_data() const
415 LOGCALL(DB
, string
, "Xapian::Document::Internal::get_data", NO_ARGS
);
416 if (data_here
) RETURN(data
);
417 if (!database
.get()) RETURN(string());
418 RETURN(do_get_data());
// Setter taking the new document data as `data_`.
// NOTE(review): only the signature survives in this copy of the file; the
// return type and the whole body are missing — restore from the pristine
// source.
422 Xapian::Document::Internal::set_data(const string
&data_
)
// Open a term list for this document (logged as returning TermList *).
//
// Visible behaviour: one path returns a new MapTermList over the in-memory
// `terms` map; another returns NULL when there is no database and otherwise
// database->open_term_list(did).
//
// NOTE(review): as the text stands the first RETURN is unconditional, making
// the database path unreachable — the guarding condition (presumably a test
// of whether `terms` has been populated; verify) is missing from this copy of
// the file.  Restore from the pristine source.
429 Xapian::Document::Internal::open_term_list() const
431 LOGCALL(DB
, TermList
*, "Document::Internal::open_term_list", NO_ARGS
);
433 RETURN(new MapTermList(terms
.begin(), terms
.end()));
435 if (!database
.get()) RETURN(NULL
);
436 RETURN(database
->open_term_list(did
));
// Store `value` in `slot`.  A non-empty value is written to values[slot].
// NOTE(review): the handling of an empty value (described by the trailing
// comment) and any preparation done before touching `values` are missing from
// this copy of the file — restore from the pristine source.
440 Xapian::Document::Internal::add_value(Xapian::valueno slot
, const string
&value
)
443 if (!value
.empty()) {
444 values
[slot
] = value
;
446 // Empty values aren't stored, but replace any existing value by
// Remove the value in `slot`.
//
// Throws Xapian::InvalidArgumentError when `slot` is not found in `values`.
// NOTE(review): the statement that actually removes the entry, and any
// preparation done before the lookup, are missing from this copy of the file
// — restore from the pristine source.
453 Xapian::Document::Internal::remove_value(Xapian::valueno slot
)
456 map
<Xapian::valueno
, string
>::iterator i
= values
.find(slot
);
457 if (i
== values
.end()) {
458 throw Xapian::InvalidArgumentError("Value #" + str(slot
) +
459 " is not present in document, in "
460 "Xapian::Document::Internal::remove_value()");
// NOTE(review): only the signature survives in this copy of the file; the
// return type and the whole body are missing — restore from the pristine
// source.
466 Xapian::Document::Internal::clear_values()
// Record an occurrence of `tname` at position `tpos`.
//
// Visible behaviour: positions_modified is set; when `tname` is not yet in
// `terms`, a new OmDocumentTerm(wdfinc) with `tpos` appended is inserted;
// otherwise add_position(wdfinc, tpos) is called on the existing entry and
// its bool result guards a statement that is not visible here.
//
// NOTE(review): lines are missing from this copy of the file (the start of
// the body, the else keyword, the statements guarded by the if, closing
// braces) — restore from the pristine source.
473 Xapian::Document::Internal::add_posting(const string
& tname
, Xapian::termpos tpos
,
474 Xapian::termcount wdfinc
)
477 positions_modified
= true;
479 map
<string
, OmDocumentTerm
>::iterator i
;
480 i
= terms
.find(tname
);
481 if (i
== terms
.end()) {
483 OmDocumentTerm
newterm(wdfinc
);
484 newterm
.append_position(tpos
);
485 terms
.insert(make_pair(tname
, std::move(newterm
)));
487 if (i
->second
.add_position(wdfinc
, tpos
))
// Add `tname` without positional information.
//
// Visible behaviour: when `tname` is not yet in `terms`, a new
// OmDocumentTerm(wdfinc) is inserted; otherwise increase_wdf(wdfinc) is
// called on the existing entry and its result guards a statement that is not
// visible here.
//
// NOTE(review): lines are missing from this copy of the file (the start of
// the body, the else keyword, the statements guarded by the if, closing
// braces) — restore from the pristine source.
493 Xapian::Document::Internal::add_term(const string
& tname
, Xapian::termcount wdfinc
)
497 map
<string
, OmDocumentTerm
>::iterator i
;
498 i
= terms
.find(tname
);
499 if (i
== terms
.end()) {
501 OmDocumentTerm
newterm(wdfinc
);
502 terms
.insert(make_pair(tname
, std::move(newterm
)));
504 if (i
->second
.increase_wdf(wdfinc
))
// Remove the occurrence of `tname` at position `tpos`.
//
// Visible behaviour: if `tname` is absent from `terms` or its entry is marked
// deleted, Xapian::InvalidArgumentError is thrown.  Otherwise the position is
// removed from the entry, the wdf is decreased by `wdfdec` when that is
// non-zero, and positions_modified is set.
//
// NOTE(review): two throw statements appear back to back below, so the second
// is unreachable as written — the condition guarding the first (presumably an
// empty-term check, going by its message; verify) is missing from this copy
// of the file, as are other lines.  Restore from the pristine source.
510 Xapian::Document::Internal::remove_posting(const string
& tname
,
511 Xapian::termpos tpos
,
512 Xapian::termcount wdfdec
)
516 map
<string
, OmDocumentTerm
>::iterator i
;
517 i
= terms
.find(tname
);
518 if (i
== terms
.end() || i
->second
.is_deleted()) {
520 throw Xapian::InvalidArgumentError("Empty termnames are invalid");
521 throw Xapian::InvalidArgumentError("Term '" + tname
+
522 "' is not present in document, in "
523 "Xapian::Document::Internal::remove_posting()");
525 i
->second
.remove_position(tpos
);
526 if (wdfdec
) i
->second
.decrease_wdf(wdfdec
);
527 positions_modified
= true;
// Remove the occurrences of `term` with positions in
// [termpos_first, termpos_last].
//
// Visible behaviour: if `term` is absent from `terms` or its entry is marked
// deleted, Xapian::InvalidArgumentError is thrown.  Otherwise
// remove_positions() is called on the entry, positions_modified is set, and
// the wdf is decreased by n_removed * wdf_dec, saturating to the maximum
// Xapian::termcount when mul_overflows() reports overflow.
//
// NOTE(review): two throw statements appear back to back below, so the second
// is unreachable as written — the condition guarding the first (presumably an
// empty-term check, going by its message; verify) is missing from this copy
// of the file, as are the conditions around the wdf update and the final
// return.  Restore from the pristine source.
531 Xapian::Document::Internal::remove_postings(const string
& term
,
532 Xapian::termpos termpos_first
,
533 Xapian::termpos termpos_last
,
534 Xapian::termcount wdf_dec
)
538 auto i
= terms
.find(term
);
539 if (i
== terms
.end() || i
->second
.is_deleted()) {
541 throw Xapian::InvalidArgumentError("Empty termnames are invalid");
542 throw Xapian::InvalidArgumentError("Term '" + term
+
543 "' is not present in document, in "
544 "Xapian::Document::Internal::remove_postings()");
546 auto n_removed
= i
->second
.remove_positions(termpos_first
, termpos_last
);
548 positions_modified
= true;
550 Xapian::termcount wdf_delta
;
551 if (mul_overflows(n_removed
, wdf_dec
, wdf_delta
)) {
552 // Decreasing by the maximum value will zero the wdf.
553 wdf_delta
= numeric_limits
<Xapian::termcount
>::max();
555 i
->second
.decrease_wdf(wdf_delta
);
// Remove `tname` from the document.
//
// Visible behaviour: if `tname` is absent from `terms` or its entry is marked
// deleted, Xapian::InvalidArgumentError is thrown.  If positions_modified is
// not already set, it becomes true when the entry has at least one position.
//
// NOTE(review): two throw statements appear back to back below, so the second
// is unreachable as written — the condition guarding the first (presumably an
// empty-term check, going by its message; verify) is missing from this copy
// of the file, as is the statement that actually removes the term.  Restore
// from the pristine source.
562 Xapian::Document::Internal::remove_term(const string
& tname
)
565 map
<string
, OmDocumentTerm
>::iterator i
;
566 i
= terms
.find(tname
);
567 if (i
== terms
.end() || i
->second
.is_deleted()) {
569 throw Xapian::InvalidArgumentError("Empty termnames are invalid");
570 throw Xapian::InvalidArgumentError("Term '" + tname
+
571 "' is not present in document, in "
572 "Xapian::Document::Internal::remove_term()");
575 if (!positions_modified
) {
576 positions_modified
= (i
->second
.positionlist_count() > 0);
// Visible behaviour: positions_modified is set unconditionally (see the
// FIXME below).
// NOTE(review): the statements that actually clear the terms are missing from
// this copy of the file — restore from the pristine source.
582 Xapian::Document::Internal::clear_terms()
587 // Assume there was a term with positions for now.
588 // FIXME: may be worth checking...
589 positions_modified
= true;
// Return termlist_size.
// NOTE(review): the code around the commented-out alternative is missing from
// this copy of the file (whatever ensures termlist_size is up to date before
// it is returned) — restore from the pristine source.
593 Xapian::Document::Internal::termlist_count() const
596 // How equivalent is this line to the rest?
597 // return database.get() ? database->open_term_list(did)->get_approx_size() : 0;
601 return termlist_size
;
// Make sure the in-memory `terms` map is populated.
//
// Returns immediately when terms_here is already set.  Otherwise, if there is
// a database, every term from database->open_term_list(did) is copied into
// `terms` as an OmDocumentTerm carrying the term's wdf and all of its
// positions (inserted with an end() hint).  Finally termlist_size is set to
// terms.size().
//
// NOTE(review): closing braces and any statement after the termlist_size
// assignment (e.g. one recording that the terms are now loaded) are missing
// from this copy of the file — restore from the pristine source.
605 Xapian::Document::Internal::need_terms() const
607 if (terms_here
) return;
608 if (database
.get()) {
609 Xapian::TermIterator
t(database
->open_term_list(did
));
610 Xapian::TermIterator
tend(NULL
);
611 for ( ; t
!= tend
; ++t
) {
612 Xapian::PositionIterator p
= t
.positionlist_begin();
613 OmDocumentTerm
term(t
.get_wdf());
614 for ( ; p
!= t
.positionlist_end(); ++p
) {
615 term
.append_position(*p
);
617 terms
.insert(terms
.end(), make_pair(*t
, std::move(term
)));
620 termlist_size
= terms
.size();
// Return the number of entries in `values` (logged as Xapian::valueno).
// NOTE(review): lines between the LOGCALL and the RETURN are missing from this
// copy of the file; presumably they load the values first — verify against the
// pristine source.
625 Xapian::Document::Internal::values_count() const
627 LOGCALL(DB
, Xapian::valueno
, "Document::Internal::values_count", NO_ARGS
);
630 RETURN(values
.size());
// Build a description string starting with "Document(".
//
// Visible behaviour: `data` is appended via description_append(); the number
// of values and termlist_size are appended via str(); ", " separators are
// added depending on data_here / values_here / terms_here; there is a final
// section entered when a database is present.
//
// NOTE(review): the conditions opening each section, the section labels, the
// closing text and the return are missing from this copy of the file —
// restore from the pristine source.
634 Xapian::Document::Internal::get_description() const
636 string desc
= "Document(";
638 // description_append ?
641 description_append(desc
, data
);
646 if (data_here
) desc
+= ", ";
648 desc
+= str(values
.size());
653 if (data_here
|| values_here
) desc
+= ", ";
655 desc
+= str(termlist_size
);
659 if (database
.get()) {
660 if (data_here
|| values_here
|| terms_here
) desc
+= ", ";
661 // database->get_description() if/when that returns a non-generic
// Visible behaviour: when there is a database, asserts that `values` is empty
// and then fills it via do_get_all_values(values).
// NOTE(review): the surrounding condition and any statement recording that
// the values are now loaded are missing from this copy of the file — restore
// from the pristine source.
672 Xapian::Document::Internal::need_values() const
675 if (database
.get()) {
676 Assert(values
.empty());
677 do_get_all_values(values
);
// Destructor: tells the database that this document object is going away via
// invalidate_doc_object(this).
// NOTE(review): a line is missing between the signature and the call in this
// copy of the file; since other methods test database.get() before using the
// database, confirm whether a null check belongs here.
683 Xapian::Document::Internal::~Internal()
686 database
->invalidate_doc_object(this);