Reimplement Language Modelling weights
[xapian.git] / xapian-core / weight / weight.cc
blob024291f9ac2bd0b5080ffe6ada1ff29957dc0e9b
1 /** @file
2 * @brief Xapian::Weight base class
3 */
4 /* Copyright (C) 2007,2008,2009,2014,2017,2019,2024 Olly Betts
5 * Copyright (C) 2009 Lemur Consulting Ltd
6 * Copyright (C) 2017 Vivek Pal
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
25 #include "xapian/weight.h"
27 #include "backends/leafpostlist.h"
28 #include "weightinternal.h"
30 #include "omassert.h"
31 #include "debuglog.h"
33 #include "xapian/error.h"
35 using namespace std;
37 namespace Xapian {
39 void
40 Weight::init_(const Internal & stats, Xapian::termcount query_length,
41 const Xapian::Database::Internal* shard)
43 LOGCALL_VOID(MATCH, "Weight::init_", stats | query_length | shard);
44 collection_size_ = stats.collection_size;
45 rset_size_ = stats.rset_size;
46 if (stats_needed & AVERAGE_LENGTH)
47 average_length_ = stats.get_average_length();
48 if (stats_needed & DOC_LENGTH_MAX)
49 doclength_upper_bound_ = shard->get_doclength_upper_bound();
50 if (stats_needed & DOC_LENGTH_MIN)
51 doclength_lower_bound_ = shard->get_doclength_lower_bound();
52 if (stats_needed & UNIQUE_TERMS_MAX)
53 unique_terms_upper_bound_ = shard->get_unique_terms_upper_bound();
54 if (stats_needed & UNIQUE_TERMS_MIN)
55 unique_terms_lower_bound_ = shard->get_unique_terms_lower_bound();
56 if (stats_needed & TOTAL_LENGTH)
57 total_length_ = stats.total_length;
58 if (stats_needed & DB_DOC_LENGTH_MAX)
59 db_doclength_upper_bound_ = stats.db_doclength_upper_bound;
60 if (stats_needed & DB_DOC_LENGTH_MIN)
61 db_doclength_lower_bound_ = stats.db_doclength_lower_bound;
62 if (stats_needed & DB_UNIQUE_TERMS_MAX)
63 db_unique_terms_upper_bound_ = stats.db_unique_terms_upper_bound;
64 if (stats_needed & DB_UNIQUE_TERMS_MIN)
65 db_unique_terms_lower_bound_ = stats.db_unique_terms_lower_bound;
66 collectionfreq_ = 0;
67 wdf_upper_bound_ = 0;
68 termfreq_ = 0;
69 reltermfreq_ = 0;
70 query_length_ = query_length;
71 wqf_ = 1;
72 init(0.0);
75 void
76 Weight::init_(const Internal & stats, Xapian::termcount query_length,
77 const string & term, Xapian::termcount wqf, double factor,
78 const Xapian::Database::Internal* shard,
79 void* postlist_void)
81 LOGCALL_VOID(MATCH, "Weight::init_", stats | query_length | term | wqf | factor | shard | postlist_void);
82 collection_size_ = stats.collection_size;
83 rset_size_ = stats.rset_size;
84 if (stats_needed & AVERAGE_LENGTH)
85 average_length_ = stats.get_average_length();
86 if (stats_needed & DOC_LENGTH_MAX)
87 doclength_upper_bound_ = shard->get_doclength_upper_bound();
88 if (stats_needed & DOC_LENGTH_MIN)
89 doclength_lower_bound_ = shard->get_doclength_lower_bound();
90 if (stats_needed & UNIQUE_TERMS_MAX)
91 unique_terms_upper_bound_ = shard->get_unique_terms_upper_bound();
92 if (stats_needed & UNIQUE_TERMS_MIN)
93 unique_terms_lower_bound_ = shard->get_unique_terms_lower_bound();
94 if (stats_needed & TOTAL_LENGTH)
95 total_length_ = stats.total_length;
96 if (stats_needed & WDF_MAX) {
97 auto postlist = static_cast<LeafPostList*>(postlist_void);
98 wdf_upper_bound_ = postlist->get_wdf_upper_bound();
100 if (stats_needed & DB_DOC_LENGTH_MAX)
101 db_doclength_upper_bound_ = stats.db_doclength_upper_bound;
102 if (stats_needed & DB_DOC_LENGTH_MIN)
103 db_doclength_lower_bound_ = stats.db_doclength_lower_bound;
104 if (stats_needed & DB_UNIQUE_TERMS_MAX)
105 db_unique_terms_upper_bound_ = stats.db_unique_terms_upper_bound;
106 if (stats_needed & DB_UNIQUE_TERMS_MIN)
107 db_unique_terms_lower_bound_ = stats.db_unique_terms_lower_bound;
108 if (stats_needed & DB_WDF_MAX) {
109 // FIXME: Nothing uses this stat, so for now return a correct but
110 // likely fairly loose upper bound. Once we have something that
111 // wants to use this we can implement tracking a per-term wdf_max
112 // across the whole database.
113 db_wdf_upper_bound_ = stats.db_doclength_upper_bound;
115 if (stats_needed & (TERMFREQ | RELTERMFREQ | COLLECTION_FREQ)) {
116 bool ok = stats.get_stats(term,
117 termfreq_, reltermfreq_, collectionfreq_);
118 (void)ok;
119 Assert(ok);
121 query_length_ = query_length;
122 wqf_ = wqf;
123 init(factor);
126 void
127 Weight::init_(const Internal & stats, Xapian::termcount query_length,
128 double factor, Xapian::doccount termfreq,
129 Xapian::doccount reltermfreq, Xapian::termcount collection_freq,
130 const Xapian::Database::Internal* shard)
132 LOGCALL_VOID(MATCH, "Weight::init_", stats | query_length | factor | termfreq | reltermfreq | collection_freq | shard);
133 // Synonym case.
134 collection_size_ = stats.collection_size;
135 rset_size_ = stats.rset_size;
136 if (stats_needed & AVERAGE_LENGTH)
137 average_length_ = stats.get_average_length();
138 if (stats_needed & (DOC_LENGTH_MAX | WDF_MAX)) {
139 doclength_upper_bound_ = shard->get_doclength_upper_bound();
140 // The doclength is an upper bound on the wdf. This is obviously true
141 // for normal terms, but SynonymPostList ensures that it is also true
142 // for synonym terms by clamping the wdf values returned to the
143 // doclength.
145 // (This clamping is only actually necessary in cases where a
146 // constituent term of the synonym is repeated.)
147 wdf_upper_bound_ = doclength_upper_bound_;
149 if (stats_needed & DOC_LENGTH_MIN)
150 doclength_lower_bound_ = shard->get_doclength_lower_bound();
151 if (stats_needed & UNIQUE_TERMS_MAX)
152 unique_terms_upper_bound_ = shard->get_unique_terms_upper_bound();
153 if (stats_needed & UNIQUE_TERMS_MIN)
154 unique_terms_lower_bound_ = shard->get_unique_terms_lower_bound();
155 if (stats_needed & TOTAL_LENGTH)
156 total_length_ = stats.total_length;
157 if (stats_needed & (DB_DOC_LENGTH_MAX | DB_WDF_MAX)) {
158 db_doclength_upper_bound_ = stats.db_doclength_upper_bound;
159 // The doclength is an upper bound on the wdf. This is obviously true
160 // for normal terms, but SynonymPostList ensures that it is also true
161 // for synonym terms by clamping the wdf values returned to the
162 // doclength.
164 // (This clamping is only actually necessary in cases where a
165 // constituent term of the synonym is repeated.)
166 db_wdf_upper_bound_ = db_doclength_upper_bound_;
168 if (stats_needed & DB_DOC_LENGTH_MIN)
169 db_doclength_lower_bound_ = stats.db_doclength_lower_bound;
170 if (stats_needed & DB_UNIQUE_TERMS_MAX)
171 db_unique_terms_upper_bound_ = stats.db_unique_terms_upper_bound;
172 if (stats_needed & DB_UNIQUE_TERMS_MIN)
173 db_unique_terms_lower_bound_ = stats.db_unique_terms_lower_bound;
175 termfreq_ = termfreq;
176 reltermfreq_ = reltermfreq;
177 query_length_ = query_length;
178 collectionfreq_ = collection_freq;
179 wqf_ = 1;
180 init(factor);
183 Weight::~Weight() { }
185 string
186 Weight::name() const
188 return string();
191 string
192 Weight::short_name() const
194 return string();
197 string
198 Weight::serialise() const
200 throw Xapian::UnimplementedError("serialise() not supported for this Xapian::Weight subclass");
203 Weight *
204 Weight::unserialise(const string &) const
206 throw Xapian::UnimplementedError("unserialise() not supported for this Xapian::Weight subclass");
209 double
210 Weight::get_sumextra(Xapian::termcount,
211 Xapian::termcount,
212 Xapian::termcount) const
214 return 0.0;
217 double
218 Weight::get_maxextra() const
220 return 0.0;
223 const Weight *
224 Weight::create(const string & s, const Registry & reg)
226 const char *p = s.c_str();
227 std::string scheme;
229 while (*p != ' ') {
230 if (*p == '\0') break;
231 scheme += *p;
232 p++;
235 if (*p == ' ') p++;
236 return reg.get_weighting_scheme(scheme)->create_from_parameters(p);
239 Weight *
240 Weight::create_from_parameters(const char *) const
242 throw Xapian::UnimplementedError("create_from_parameters() not supported for this Xapian::Weight subclass");