Merge bm25weight2 and tradweight2 into weight1
[xapian.git] / xapian-core / tests / api_weight.cc
blob8279705ed87f60030b58992ea1772e06ed366b25
1 /** @file
2 * @brief tests of Xapian::Weight subclasses
3 */
4 /* Copyright (C) 2004-2024 Olly Betts
5 * Copyright (C) 2013 Aarsh Shah
6 * Copyright (C) 2016 Vivek Pal
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include <config.h>
25 #include "api_weight.h"
26 #include <cmath>
27 #include <memory>
29 #include <xapian.h>
31 #include "apitest.h"
32 #include "heap.h"
33 #include "testutils.h"
35 using namespace std;
37 template<class W>
38 static inline void
39 test_weight_class_no_params(const char* name)
41 tout << name << '\n';
42 W obj;
43 // Check name() returns the class name.
44 TEST_EQUAL(obj.name(), name);
45 // If there are no parameters, there's nothing to serialise.
46 string obj_serialised = obj.serialise();
47 TEST_EQUAL(obj_serialised.size(), 0);
48 // Check serialising and unserialising gives object with same serialisation.
49 unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
50 TEST_EQUAL(obj_serialised, wt->serialise());
51 // Check that unserialise() throws suitable error for bad serialisation.
52 // The easy case to test is extra junk after the serialised weight.
53 try {
54 unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
55 FAIL_TEST(name << " did not throw for unserialise with junk appended");
56 } catch (const Xapian::SerialisationError& e) {
57 // Check the exception message contains the weighting scheme name
58 // (regression test for TradWeight's exception saying "BM25").
59 string target = name + CONST_STRLEN("Xapian::");
60 TEST(e.get_msg().find(target) != string::npos);
// W should be the class name; it is also stringified and passed as the
// expected result of name().
#define TEST_WEIGHT_CLASS_NO_PARAMS(W) test_weight_class_no_params<W>(#W)
66 template<class W>
67 static inline void
68 test_weight_class(const char* name, const W& obj_default, const W& obj_other)
70 tout << name << '\n';
71 W obj;
72 // Check name() returns the class name.
73 TEST_EQUAL(obj.name(), name);
74 TEST_EQUAL(obj_default.name(), name);
75 TEST_EQUAL(obj_other.name(), name);
76 // Check serialisation matches that of object constructed with explicit
77 // parameter values of what the defaults are meant to be.
78 string obj_serialised = obj.serialise();
79 TEST_EQUAL(obj_serialised, obj_default.serialise());
80 // Check serialisation is different to object with different parameters.
81 string obj_other_serialised = obj_other.serialise();
82 TEST_NOT_EQUAL(obj_serialised, obj_other_serialised);
83 // Check serialising and unserialising gives object with same serialisation.
84 unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
85 TEST_EQUAL(obj_serialised, wt->serialise());
86 // Check serialising and unserialising of object with different parameters.
87 unique_ptr<Xapian::Weight> wt2(W().unserialise(obj_other_serialised));
88 TEST_EQUAL(obj_other_serialised, wt2->serialise());
89 // Check that unserialise() throws suitable error for bad serialisation.
90 // The easy case to test is extra junk after the serialised weight.
91 try {
92 unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
93 FAIL_TEST(name << " did not throw for unserialise with junk appended");
94 } catch (const Xapian::SerialisationError& e) {
95 // Check the exception message contains the weighting scheme name
96 // (regression test for TradWeight's exception saying "BM25").
97 string target = name + CONST_STRLEN("Xapian::");
98 TEST(e.get_msg().find(target) != string::npos);
// W should be the class name.
//
// DEFAULT should be a parenthesised parameter list to explicitly construct
// an object of class W with the documented default parameters.
//
// OTHER should be a parenthesised parameter list to construct an object with
// non-default parameters.
#define TEST_WEIGHT_CLASS(W, DEFAULT, OTHER) \
    test_weight_class<W>(#W, W DEFAULT, W OTHER)
112 /// Test serialisation and introspection of built-in weighting schemes.
113 DEFINE_TESTCASE(weightserialisation1, !backend) {
114 // Parameter-free weighting schemes.
115 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::BoolWeight);
116 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::CoordWeight);
117 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DLHWeight);
118 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DPHWeight);
119 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DiceCoeffWeight);
121 // Parameterised weighting schemes.
122 TEST_WEIGHT_CLASS(Xapian::TradWeight, (1.0), (2.0));
123 TEST_WEIGHT_CLASS(Xapian::BM25Weight,
124 (1, 0, 1, 0.5, 0.5),
125 (1, 0.5, 1, 0.5, 0.5));
126 TEST_WEIGHT_CLASS(Xapian::BM25PlusWeight,
127 (1, 0, 1, 0.5, 0.5, 1.0),
128 (1, 0, 1, 0.5, 0.5, 2.0));
129 TEST_WEIGHT_CLASS(Xapian::TfIdfWeight, ("ntn"), ("bpn"));
130 TEST_WEIGHT_CLASS(Xapian::InL2Weight, (1.0), (2.0));
131 TEST_WEIGHT_CLASS(Xapian::IfB2Weight, (1.0), (2.0));
132 TEST_WEIGHT_CLASS(Xapian::IneB2Weight, (1.0), (2.0));
133 TEST_WEIGHT_CLASS(Xapian::BB2Weight, (1.0), (2.0));
134 TEST_WEIGHT_CLASS(Xapian::PL2Weight, (1.0), (2.0));
135 TEST_WEIGHT_CLASS(Xapian::PL2PlusWeight,
136 (1.0, 0.8),
137 (2.0, 0.9));
138 TEST_WEIGHT_CLASS(Xapian::LMWeight,
139 (0.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0.7, 2000.0),
140 (0.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0.7));
143 /// Basic test of using weighting schemes.
144 DEFINE_TESTCASE(weight1, backend) {
145 Xapian::Database db(get_database("etext"));
146 Xapian::Enquire enquire(db);
147 Xapian::Enquire enquire_scaled(db);
148 auto term = "robinson";
149 Xapian::Query q{term};
150 enquire.set_query(q);
151 enquire_scaled.set_query(q * 15.0);
152 auto expected_matches = db.get_termfreq(term);
153 auto helper = [&](const Xapian::Weight& weight,
154 string_view name,
155 string_view params) {
156 tout << name << '(' << params << ")\n";
157 enquire.set_weighting_scheme(weight);
158 enquire_scaled.set_weighting_scheme(weight);
159 Xapian::MSet mset = enquire.get_mset(0, expected_matches + 1);
160 TEST_EQUAL(mset.size(), expected_matches);
161 if (name == "Xapian::BoolWeight") {
162 /* All weights should be zero. */
163 TEST_EQUAL(mset[0].get_weight(), 0.0);
164 TEST_EQUAL(mset.back().get_weight(), 0.0);
165 } else if (name == "Xapian::CoordWeight") {
166 /* All weights should be 1 for a single term query. */
167 TEST_EQUAL(mset[0].get_weight(), 1.0);
168 TEST_EQUAL(mset.back().get_weight(), 1.0);
169 } else if (!params.empty()) {
170 /* All weights should be equal with these particular parameters. */
171 TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
172 TEST_EQUAL(mset[0].get_weight(), mset.back().get_weight());
173 } else {
174 TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
175 TEST_NOT_EQUAL(mset[0].get_weight(), mset.back().get_weight());
177 Xapian::MSet mset_scaled = enquire_scaled.get_mset(0, expected_matches);
178 TEST_EQUAL(mset_scaled.size(), expected_matches);
179 for (Xapian::doccount i = 0; i < expected_matches; ++i) {
180 TEST_EQUAL_DOUBLE(mset_scaled[i].get_weight(),
181 mset[i].get_weight() * 15.0);
184 #define TEST_WEIGHTING_SCHEME(W, ...) helper(W(__VA_ARGS__), #W, #__VA_ARGS__)
185 TEST_WEIGHTING_SCHEME(Xapian::BoolWeight);
186 TEST_WEIGHTING_SCHEME(Xapian::CoordWeight);
187 TEST_WEIGHTING_SCHEME(Xapian::DLHWeight);
188 TEST_WEIGHTING_SCHEME(Xapian::DPHWeight);
189 TEST_WEIGHTING_SCHEME(Xapian::DiceCoeffWeight);
190 TEST_WEIGHTING_SCHEME(Xapian::TradWeight);
191 TEST_WEIGHTING_SCHEME(Xapian::BM25Weight);
192 TEST_WEIGHTING_SCHEME(Xapian::BM25PlusWeight);
193 TEST_WEIGHTING_SCHEME(Xapian::TfIdfWeight);
194 TEST_WEIGHTING_SCHEME(Xapian::InL2Weight);
195 TEST_WEIGHTING_SCHEME(Xapian::IfB2Weight);
196 TEST_WEIGHTING_SCHEME(Xapian::IneB2Weight);
197 TEST_WEIGHTING_SCHEME(Xapian::BB2Weight);
198 TEST_WEIGHTING_SCHEME(Xapian::PL2Weight);
199 TEST_WEIGHTING_SCHEME(Xapian::PL2PlusWeight);
200 TEST_WEIGHTING_SCHEME(Xapian::LMWeight);
201 // Regression test for bug fixed in 1.2.4.
202 TEST_WEIGHTING_SCHEME(Xapian::BM25Weight, 0, 0, 0, 0, 1);
203 /* As mentioned in the documentation, when parameter k is 0, wdf and
204 * document length don't affect the weights. Regression test for bug fixed
205 * in 1.2.4.
207 TEST_WEIGHTING_SCHEME(Xapian::TradWeight, 0);
208 #undef TEST_WEIGHTING_SCHEME
211 /** Regression test for bug fixed in 1.0.5.
213 * This test would fail under valgrind because it used an uninitialised value.
215 DEFINE_TESTCASE(bm25weight1, backend) {
216 Xapian::Enquire enquire(get_database("apitest_simpledata"));
217 enquire.set_weighting_scheme(Xapian::BM25Weight(1, 25, 1, 0.01, 0.5));
218 enquire.set_query(Xapian::Query("word"));
220 Xapian::MSet mset = enquire.get_mset(0, 25);
223 // Test parameter combinations which should be unaffected by doclength.
224 DEFINE_TESTCASE(bm25weight4, backend) {
225 Xapian::Database db = get_database("apitest_simpledata");
226 Xapian::Enquire enquire(db);
227 enquire.set_query(Xapian::Query("paragraph"));
228 Xapian::MSet mset;
230 enquire.set_weighting_scheme(Xapian::BM25Weight(1, 0, 1, 0, 0.5));
231 mset = enquire.get_mset(0, 10);
232 TEST_EQUAL(mset.size(), 5);
233 // Expect: wdf has an effect on weight, but doclen doesn't.
234 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
235 TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
236 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
237 TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
239 enquire.set_weighting_scheme(Xapian::BM25Weight(0, 0, 1, 1, 0.5));
240 mset = enquire.get_mset(0, 10);
241 TEST_EQUAL(mset.size(), 5);
242 // Expect: neither wdf nor doclen affects weight.
243 TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
246 /// Test non-zero k2 with zero k1.
247 // Regression test for bug fixed in 1.2.17 and 1.3.2.
248 DEFINE_TESTCASE(bm25weight5, backend) {
249 Xapian::Database db = get_database("apitest_simpledata");
250 Xapian::Enquire enquire(db);
251 enquire.set_query(Xapian::Query("paragraph"));
252 Xapian::MSet mset;
254 enquire.set_weighting_scheme(Xapian::BM25Weight(0, 1, 1, 0.5, 0.5));
255 mset = enquire.get_mset(0, 10);
256 TEST_EQUAL(mset.size(), 5);
257 // Expect: wdf has no effect on weight; shorter docs rank higher.
258 mset_expect_order(mset, 3, 5, 1, 4, 2);
259 TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[1].get_weight());
260 TEST_REL(mset[1].get_weight(),>,mset[2].get_weight());
261 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
262 TEST_REL(mset[3].get_weight(),>,mset[4].get_weight());
265 // Test parameter combinations which should be unaffected by doclength.
266 DEFINE_TESTCASE(bm25plusweight2, backend) {
267 Xapian::Database db = get_database("apitest_simpledata");
268 Xapian::Enquire enquire(db);
269 enquire.set_query(Xapian::Query("paragraph"));
270 Xapian::MSet mset;
272 enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0, 0.5, 1));
273 mset = enquire.get_mset(0, 10);
274 TEST_EQUAL(mset.size(), 5);
275 // Expect: wdf has an effect on weight, but doclen doesn't.
276 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
277 TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
278 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
279 TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
281 enquire.set_weighting_scheme(Xapian::BM25PlusWeight(0, 0, 1, 1, 0.5, 1));
282 mset = enquire.get_mset(0, 10);
283 TEST_EQUAL(mset.size(), 5);
284 // Expect: neither wdf nor doclen affects weight.
285 TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
288 // Regression test for a mistake corrected in the BM25+ implementation.
289 DEFINE_TESTCASE(bm25plusweight3, backend) {
290 Xapian::Database db = get_database("apitest_simpledata");
291 Xapian::Enquire enquire(db);
292 enquire.set_query(Xapian::Query("paragraph"));
293 Xapian::MSet mset;
295 enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0.5, 0.5, 1));
296 mset = enquire.get_mset(0, 10);
297 TEST_EQUAL(mset.size(), 5);
299 // The value of each doc weight calculated manually from the BM25+ formulae
300 // by using the respective document statistics.
301 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.7920796567487473);
302 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.7846980783848447);
303 TEST_EQUAL_DOUBLE(mset[2].get_weight(), 0.7558817623365934);
304 TEST_EQUAL_DOUBLE(mset[3].get_weight(), 0.7210119356168847);
305 TEST_EQUAL_DOUBLE(mset[4].get_weight(), 0.7210119356168847);
309 // Test for invalid values of c.
310 DEFINE_TESTCASE(inl2weight2, !backend) {
311 // InvalidArgumentError should be thrown if the parameter c is invalid.
312 TEST_EXCEPTION(Xapian::InvalidArgumentError,
313 Xapian::InL2Weight wt(-2.0));
315 TEST_EXCEPTION(Xapian::InvalidArgumentError,
316 Xapian::InL2Weight wt2(0.0));
319 // Feature tests for Inl2Weight
320 DEFINE_TESTCASE(inl2weight3, backend) {
321 Xapian::Database db = get_database("apitest_simpledata");
322 Xapian::Enquire enquire(db);
323 Xapian::Query query("banana");
325 enquire.set_query(query);
326 enquire.set_weighting_scheme(Xapian::InL2Weight(2.0));
328 Xapian::MSet mset1;
329 mset1 = enquire.get_mset(0, 10);
330 TEST_EQUAL(mset1.size(), 1);
331 mset_expect_order(mset1, 6);
333 /* The value has been calculated in the python interpreter by looking at the
334 * database statistics. */
335 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.559711143842063);
338 // Test for invalid values of c.
339 DEFINE_TESTCASE(ifb2weight2, !backend) {
340 // InvalidArgumentError should be thrown if the parameter c is invalid.
341 TEST_EXCEPTION(Xapian::InvalidArgumentError,
342 Xapian::IfB2Weight wt(-2.0));
344 TEST_EXCEPTION(Xapian::InvalidArgumentError,
345 Xapian::IfB2Weight wt2(0.0));
348 // Feature test
349 DEFINE_TESTCASE(ifb2weight3, backend) {
350 Xapian::Database db = get_database("apitest_simpledata");
351 Xapian::Enquire enquire(db);
352 Xapian::Query query("banana");
354 enquire.set_query(query);
355 enquire.set_weighting_scheme(Xapian::IfB2Weight(2.0));
357 Xapian::MSet mset1;
358 mset1 = enquire.get_mset(0, 10);
359 TEST_EQUAL(mset1.size(), 1);
361 /* The value of the weight has been manually calculated using the statistics
362 * of the test database. */
363 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.119422287684126);
366 // Test for invalid values of c.
367 DEFINE_TESTCASE(ineb2weight2, !backend) {
368 // InvalidArgumentError should be thrown if parameter c is invalid.
369 TEST_EXCEPTION(Xapian::InvalidArgumentError,
370 Xapian::IneB2Weight wt(-2.0));
372 TEST_EXCEPTION(Xapian::InvalidArgumentError,
373 Xapian::IneB2Weight wt2(0.0));
376 // Feature test.
377 DEFINE_TESTCASE(ineb2weight3, backend) {
378 Xapian::Database db = get_database("apitest_simpledata");
379 Xapian::Enquire enquire(db);
380 Xapian::Query query("paragraph");
381 enquire.set_query(query);
382 enquire.set_weighting_scheme(Xapian::IneB2Weight(2.0));
384 Xapian::MSet mset1;
385 mset1 = enquire.get_mset(0, 10);
386 TEST_EQUAL(mset1.size(), 5);
388 // The third document in the database is 4th in the ranking.
389 /* The weight value has been manually calculated by using the statistics
390 * of the test database. */
391 TEST_EQUAL_DOUBLE(mset1[4].get_weight(), 0.61709730297692400036);
394 // Test for invalid values of c.
395 DEFINE_TESTCASE(bb2weight2, !backend) {
396 // InvalidArgumentError should be thrown if the parameter c is invalid.
397 TEST_EXCEPTION(Xapian::InvalidArgumentError,
398 Xapian::BB2Weight wt(-2.0));
400 TEST_EXCEPTION(Xapian::InvalidArgumentError,
401 Xapian::BB2Weight wt2(0.0));
404 // Feature test
405 DEFINE_TESTCASE(bb2weight3, backend) {
406 Xapian::Database db = get_database("apitest_simpledata");
407 Xapian::Enquire enquire(db);
408 Xapian::Query query("paragraph");
410 enquire.set_query(query);
411 enquire.set_weighting_scheme(Xapian::BB2Weight(2.0));
413 Xapian::MSet mset1;
414 mset1 = enquire.get_mset(0, 10);
415 TEST_EQUAL(mset1.size(), 5);
416 /* The third document in the database has the highest weight and is the
417 * first in the mset. */
418 // Value calculated manually by using the statistics of the test database.
419 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.6823696969784483);
421 // Test with OP_SCALE_WEIGHT and a small factor (regression test, as we
422 // were applying the factor to the upper bound twice).
423 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 1.0 / 1024));
424 enquire.set_weighting_scheme(Xapian::BB2Weight(2.0));
426 Xapian::MSet mset3;
427 mset3 = enquire.get_mset(0, 10);
428 TEST_EQUAL(mset3.size(), 5);
430 for (int i = 0; i < 5; ++i) {
431 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight() * 1024);
435 // Regression test: we used to calculate log2(0) when there was only one doc.
436 DEFINE_TESTCASE(bb2weight4, backend) {
437 Xapian::Database db = get_database("apitest_onedoc");
438 Xapian::Enquire enquire(db);
439 Xapian::Query query("word");
441 enquire.set_query(query);
442 enquire.set_weighting_scheme(Xapian::BB2Weight());
444 Xapian::MSet mset1;
445 mset1 = enquire.get_mset(0, 10);
446 TEST_EQUAL(mset1.size(), 1);
447 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.431020621347435);
450 // Feature test.
451 DEFINE_TESTCASE(dlhweight1, backend) {
452 Xapian::Database db = get_database("apitest_simpledata");
453 Xapian::Enquire enquire(db);
454 Xapian::Query query("a");
456 enquire.set_query(query);
457 enquire.set_weighting_scheme(Xapian::DLHWeight());
459 Xapian::MSet mset1;
460 mset1 = enquire.get_mset(0, 10);
461 TEST_EQUAL(mset1.size(), 3);
462 mset_expect_order(mset1, 3, 1, 2);
463 // Weights calculated manually using stats from the database.
464 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.0046477754371292362);
465 TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.97621929514640352757);
466 // The following weight would be negative but gets clamped to 0.
467 TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.0);
470 static void
471 gen_wdf_eq_doclen_db(Xapian::WritableDatabase& db, const string&)
473 Xapian::Document doc;
474 doc.add_term("solo", 37);
475 db.add_document(doc);
478 // Test wdf == doclen.
479 DEFINE_TESTCASE(dlhweight3, backend) {
480 Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
481 Xapian::Enquire enquire(db);
482 Xapian::Query query("solo");
484 enquire.set_query(query);
485 enquire.set_weighting_scheme(Xapian::DLHWeight());
487 Xapian::MSet mset1;
488 mset1 = enquire.get_mset(0, 10);
489 TEST_EQUAL(mset1.size(), 1);
490 // Weight gets clamped to zero.
491 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
494 // Test for invalid values of c.
495 DEFINE_TESTCASE(pl2weight2, !backend) {
496 // InvalidArgumentError should be thrown if parameter c is invalid.
497 TEST_EXCEPTION(Xapian::InvalidArgumentError,
498 Xapian::PL2Weight wt(-2.0));
501 // Feature Test.
502 DEFINE_TESTCASE(pl2weight3, backend) {
503 Xapian::Database db = get_database("apitest_simpledata");
504 Xapian::Enquire enquire(db);
505 Xapian::Query query("paragraph");
506 enquire.set_query(query);
507 Xapian::MSet mset;
509 enquire.set_weighting_scheme(Xapian::PL2Weight(2.0));
510 mset = enquire.get_mset(0, 10);
511 TEST_EQUAL(mset.size(), 5);
512 // Expected weight difference calculated in extended precision using stats
513 // from the test database.
514 TEST_EQUAL_DOUBLE(mset[2].get_weight(),
515 mset[3].get_weight() + 0.0086861771701328694);
518 // Test for invalid values of parameters, c and delta.
519 DEFINE_TESTCASE(pl2plusweight2, !backend) {
520 // InvalidArgumentError should be thrown if parameter c is invalid.
521 TEST_EXCEPTION(Xapian::InvalidArgumentError,
522 Xapian::PL2PlusWeight wt(-2.0, 0.9));
524 // InvalidArgumentError should be thrown if parameter delta is invalid.
525 TEST_EXCEPTION(Xapian::InvalidArgumentError,
526 Xapian::PL2PlusWeight wt(1.0, -1.9));
529 // Feature Test 1 for PL2PlusWeight.
530 DEFINE_TESTCASE(pl2plusweight4, backend) {
531 Xapian::Database db = get_database("apitest_simpledata");
532 Xapian::Enquire enquire(db);
533 enquire.set_query(Xapian::Query("to"));
534 Xapian::MSet mset;
536 enquire.set_weighting_scheme(Xapian::PL2PlusWeight(2.0, 0.8));
537 mset = enquire.get_mset(0, 10);
538 TEST_EQUAL(mset.size(), 3);
539 // Expected weight difference calculated in Python using stats from the
540 // test database.
541 TEST_EQUAL_DOUBLE(mset[1].get_weight(),
542 mset[2].get_weight() + 0.016760925252262027);
545 // Feature Test 2 for PL2PlusWeight
546 DEFINE_TESTCASE(pl2plusweight5, backend) {
547 Xapian::Database db = get_database("apitest_simpledata");
548 Xapian::Enquire enquire(db);
549 Xapian::Query query("word");
550 enquire.set_query(query);
551 Xapian::MSet mset;
553 enquire.set_weighting_scheme(Xapian::PL2PlusWeight(1.0, 0.8));
554 mset = enquire.get_mset(0, 10);
555 // Expect MSet contains two documents having query "word".
556 TEST_EQUAL(mset.size(), 2);
557 // Expect Document 2 has higher weight than document 4 because
558 // "word" appears more no. of times in document 2 than document 4.
559 mset_expect_order(mset, 2, 4);
562 // Feature test
563 DEFINE_TESTCASE(dphweight1, backend) {
564 Xapian::Database db = get_database("apitest_simpledata");
565 Xapian::Enquire enquire(db);
566 Xapian::Query query("paragraph");
568 enquire.set_query(query);
569 enquire.set_weighting_scheme(Xapian::DPHWeight());
571 Xapian::MSet mset1;
572 mset1 = enquire.get_mset(0, 10);
573 TEST_EQUAL(mset1.size(), 5);
574 /* The weight has been calculated manually by using the statistics of the
575 * test database. */
576 TEST_EQUAL_DOUBLE(mset1[2].get_weight() - mset1[4].get_weight(), 0.542623617687990167);
579 // Test wdf == doclen.
580 DEFINE_TESTCASE(dphweight3, backend) {
581 Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
582 Xapian::Enquire enquire(db);
583 Xapian::Query query("solo");
585 enquire.set_query(query);
586 enquire.set_weighting_scheme(Xapian::DPHWeight());
588 Xapian::MSet mset1;
589 mset1 = enquire.get_mset(0, 10);
590 TEST_EQUAL(mset1.size(), 1);
591 // Weight gets clamped to zero.
592 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
595 // Test for various cases of normalization string.
596 DEFINE_TESTCASE(tfidfweight1, !backend) {
597 // InvalidArgumentError should be thrown if normalization string is invalid
598 TEST_EXCEPTION(Xapian::InvalidArgumentError,
599 Xapian::TfIdfWeight b("JOHN_LENNON"));
601 TEST_EXCEPTION(Xapian::InvalidArgumentError,
602 Xapian::TfIdfWeight b("LOL"));
604 /* Normalization string should be set to "ntn" by constructor if none is
605 given. */
606 Xapian::TfIdfWeight weight2;
607 TEST_EQUAL(weight2.serialise(), Xapian::TfIdfWeight("ntn").serialise());
609 TEST_EXCEPTION(Xapian::InvalidArgumentError,
610 Xapian::Weight::create("tfidf FUN NONE NONE"));
612 TEST_EXCEPTION(Xapian::InvalidArgumentError,
613 Xapian::Weight::create("tfidf NONE FUN NONE"));
615 TEST_EXCEPTION(Xapian::InvalidArgumentError,
616 Xapian::Weight::create("tfidf NONE NONE FUN"));
618 TEST_EXCEPTION(Xapian::InvalidArgumentError,
619 Xapian::Weight::create("tfidf NONE"));
621 TEST_EXCEPTION(Xapian::InvalidArgumentError,
622 Xapian::Weight::create("tfidf NONE NONE"));
625 // Feature tests for various normalization functions.
626 DEFINE_TESTCASE(tfidfweight3, backend) {
627 Xapian::Database db = get_database("apitest_simpledata");
628 Xapian::Enquire enquire(db);
629 Xapian::Query query("word");
630 Xapian::MSet mset;
632 // Check for "ntn" when termfreq != N
633 enquire.set_query(query);
634 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
635 mset = enquire.get_mset(0, 10);
636 TEST_EQUAL(mset.size(), 2);
637 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
638 mset_expect_order(mset, 2, 4);
639 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
641 // Check that wqf is taken into account.
642 enquire.set_query(Xapian::Query("word", 2));
643 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
644 Xapian::MSet mset2 = enquire.get_mset(0, 10);
645 TEST_EQUAL(mset2.size(), 2);
646 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
647 mset_expect_order(mset2, 2, 4);
648 // wqf is 2, so weights should be doubled.
649 TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
650 TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
652 // check for "nfn" when termfreq != N
653 enquire.set_query(query);
654 enquire.set_weighting_scheme(Xapian::TfIdfWeight("nfn"));
655 mset = enquire.get_mset(0, 10);
656 TEST_EQUAL(mset.size(), 2);
657 mset_expect_order(mset, 2, 4);
658 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
660 // check for "nsn" when termfreq != N
661 enquire.set_query(query);
662 enquire.set_weighting_scheme(Xapian::TfIdfWeight("nsn"));
663 mset = enquire.get_mset(0, 10);
664 TEST_EQUAL(mset.size(), 2);
665 mset_expect_order(mset, 2, 4);
666 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
668 // Check for "bnn" and for both branches of 'b'.
669 enquire.set_query(Xapian::Query("test"));
670 enquire.set_weighting_scheme(Xapian::TfIdfWeight("bnn"));
671 mset = enquire.get_mset(0, 10);
672 TEST_EQUAL(mset.size(), 1);
673 mset_expect_order(mset, 1);
674 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
676 // Check for "lnn" and for both branches of 'l'.
677 enquire.set_query(Xapian::Query("word"));
678 enquire.set_weighting_scheme(Xapian::TfIdfWeight("lnn"));
679 mset = enquire.get_mset(0, 10);
680 TEST_EQUAL(mset.size(), 2);
681 mset_expect_order(mset, 2, 4);
682 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0)); // idfn=1 and so wt=tfn=1+log(tf)
683 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0); // idfn=1 and wt=tfn=1+log(tf)=1+log(1)=1
685 // Check for "snn"
686 enquire.set_query(Xapian::Query("paragraph"));
687 enquire.set_weighting_scheme(Xapian::TfIdfWeight("snn")); // idf=1 and tfn=tf*tf
688 mset = enquire.get_mset(0, 10);
689 TEST_EQUAL(mset.size(), 5);
690 mset_expect_order(mset, 2, 1, 4, 3, 5);
691 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
692 TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
694 // Check for "ntn" when termfreq=N
695 enquire.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "t"
696 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
697 mset = enquire.get_mset(0, 10);
698 TEST_EQUAL(mset.size(), 6);
699 mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
700 for (int i = 0; i < 6; ++i) {
701 TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
704 // Check for "npn" and for both branches of 'p'
705 enquire.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "p"
706 enquire.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
707 mset = enquire.get_mset(0, 10);
708 TEST_EQUAL(mset.size(), 6);
709 mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
710 for (int i = 0; i < 6; ++i) {
711 TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
714 // Check for "Lnn".
715 enquire.set_query(Xapian::Query("word"));
716 enquire.set_weighting_scheme(Xapian::TfIdfWeight("Lnn"));
717 mset = enquire.get_mset(0, 10);
718 TEST_EQUAL(mset.size(), 2);
719 mset_expect_order(mset, 2, 4);
720 TEST_EQUAL_DOUBLE(mset[0].get_weight(), (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
721 TEST_EQUAL_DOUBLE(mset[1].get_weight(), (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
723 enquire.set_query(Xapian::Query("word"));
724 enquire.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
725 mset = enquire.get_mset(0, 10);
726 TEST_EQUAL(mset.size(), 2);
727 mset_expect_order(mset, 2, 4);
728 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
729 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
731 // Check for "mnn".
732 enquire.set_query(Xapian::Query("word"));
733 enquire.set_weighting_scheme(Xapian::TfIdfWeight("mnn"));
734 mset = enquire.get_mset(0, 10);
735 TEST_EQUAL(mset.size(), 2);
736 mset_expect_order(mset, 2, 4);
737 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 8);
738 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0 / 4);
740 // Check for "ann".
741 enquire.set_query(Xapian::Query("word"));
742 enquire.set_weighting_scheme(Xapian::TfIdfWeight("ann"));
743 mset = enquire.get_mset(0, 10);
744 TEST_EQUAL(mset.size(), 2);
745 mset_expect_order(mset, 2, 4);
746 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
747 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
749 // Check for NONE, TFIDF, NONE when termfreq != N
750 enquire.set_query(query);
751 enquire.set_weighting_scheme(
752 Xapian::TfIdfWeight(
753 Xapian::TfIdfWeight::wdf_norm::NONE,
754 Xapian::TfIdfWeight::idf_norm::TFIDF,
755 Xapian::TfIdfWeight::wt_norm::NONE));
756 mset = enquire.get_mset(0, 10);
757 TEST_EQUAL(mset.size(), 2);
758 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
759 mset_expect_order(mset, 2, 4);
760 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
762 // Check that wqf is taken into account.
763 enquire.set_query(Xapian::Query("word", 2));
764 mset2 = enquire.get_mset(0, 10);
765 TEST_EQUAL(mset2.size(), 2);
766 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
767 mset_expect_order(mset2, 2, 4);
768 // wqf is 2, so weights should be doubled.
769 TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
770 TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
772 // check for NONE, FREQ, NONE when termfreq != N
773 enquire.set_query(query);
774 enquire.set_weighting_scheme(
775 Xapian::TfIdfWeight(
776 Xapian::TfIdfWeight::wdf_norm::NONE,
777 Xapian::TfIdfWeight::idf_norm::FREQ,
778 Xapian::TfIdfWeight::wt_norm::NONE));
779 mset = enquire.get_mset(0, 10);
780 TEST_EQUAL(mset.size(), 2);
781 mset_expect_order(mset, 2, 4);
782 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
784 // check for NONE, SQUARE, NONE when termfreq != N
785 enquire.set_query(query);
786 enquire.set_weighting_scheme(
787 Xapian::TfIdfWeight(
788 Xapian::TfIdfWeight::wdf_norm::NONE,
789 Xapian::TfIdfWeight::idf_norm::SQUARE,
790 Xapian::TfIdfWeight::wt_norm::NONE));
791 mset = enquire.get_mset(0, 10);
792 TEST_EQUAL(mset.size(), 2);
793 mset_expect_order(mset, 2, 4);
794 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
796 // Check for BOOLEAN, NONE, NONE and for both branches of BOOLEAN.
797 enquire.set_query(Xapian::Query("test"));
798 enquire.set_weighting_scheme(
799 Xapian::TfIdfWeight(
800 Xapian::TfIdfWeight::wdf_norm::BOOLEAN,
801 Xapian::TfIdfWeight::idf_norm::NONE,
802 Xapian::TfIdfWeight::wt_norm::NONE));
803 mset = enquire.get_mset(0, 10);
804 TEST_EQUAL(mset.size(), 1);
805 mset_expect_order(mset, 1);
806 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
808 // Check for LOG, NONE, NONE and for both branches of LOG.
809 enquire.set_query(Xapian::Query("word"));
810 enquire.set_weighting_scheme(
811 Xapian::TfIdfWeight(
812 Xapian::TfIdfWeight::wdf_norm::LOG,
813 Xapian::TfIdfWeight::idf_norm::NONE,
814 Xapian::TfIdfWeight::wt_norm::NONE));
815 mset = enquire.get_mset(0, 10);
816 TEST_EQUAL(mset.size(), 2);
817 mset_expect_order(mset, 2, 4);
818 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0));
819 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0);
821 // Check for SQUARE, NONE, NONE.
822 enquire.set_query(Xapian::Query("paragraph"));
823 enquire.set_weighting_scheme(
824 Xapian::TfIdfWeight(
825 Xapian::TfIdfWeight::wdf_norm::SQUARE,
826 Xapian::TfIdfWeight::idf_norm::NONE,
827 Xapian::TfIdfWeight::wt_norm::NONE)); // idf=1 and tfn=tf*tf
828 mset = enquire.get_mset(0, 10);
829 TEST_EQUAL(mset.size(), 5);
830 mset_expect_order(mset, 2, 1, 4, 3, 5);
831 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
832 TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
834 // Check for NONE, TFIDF, NONE when termfreq=N
835 enquire.set_query(Xapian::Query("this"));
836 // N=termfreq and so idfn=0 for TFIDF
837 enquire.set_weighting_scheme(
838 Xapian::TfIdfWeight(
839 Xapian::TfIdfWeight::wdf_norm::NONE,
840 Xapian::TfIdfWeight::idf_norm::TFIDF,
841 Xapian::TfIdfWeight::wt_norm::NONE));
842 mset = enquire.get_mset(0, 10);
843 TEST_EQUAL(mset.size(), 6);
844 mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
845 for (int i = 0; i < 6; ++i) {
846 TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
849 // Check for NONE, PROB, NONE and for both branches of PROB
850 enquire.set_query(Xapian::Query("this"));
851 // N=termfreq and so idfn=0 for PROB
852 enquire.set_weighting_scheme(
853 Xapian::TfIdfWeight(
854 Xapian::TfIdfWeight::wdf_norm::NONE,
855 Xapian::TfIdfWeight::idf_norm::PROB,
856 Xapian::TfIdfWeight::wt_norm::NONE));
857 mset = enquire.get_mset(0, 10);
858 TEST_EQUAL(mset.size(), 6);
859 mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
860 for (int i = 0; i < 6; ++i) {
861 TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
864 enquire.set_query(Xapian::Query("word"));
865 enquire.set_weighting_scheme(
866 Xapian::TfIdfWeight(
867 Xapian::TfIdfWeight::wdf_norm::NONE,
868 Xapian::TfIdfWeight::idf_norm::PROB,
869 Xapian::TfIdfWeight::wt_norm::NONE));
870 mset = enquire.get_mset(0, 10);
871 TEST_EQUAL(mset.size(), 2);
872 mset_expect_order(mset, 2, 4);
873 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
874 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
876 // Check for LOG_AVERAGE, NONE, NONE.
877 enquire.set_query(Xapian::Query("word"));
878 enquire.set_weighting_scheme(
879 Xapian::TfIdfWeight(
880 Xapian::TfIdfWeight::wdf_norm::LOG_AVERAGE,
881 Xapian::TfIdfWeight::idf_norm::NONE,
882 Xapian::TfIdfWeight::wt_norm::NONE));
883 mset = enquire.get_mset(0, 10);
884 TEST_EQUAL(mset.size(), 2);
885 mset_expect_order(mset, 2, 4);
886 TEST_EQUAL_DOUBLE(mset[0].get_weight(),
887 (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
888 TEST_EQUAL_DOUBLE(mset[1].get_weight(),
889 (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
891 // Check for AUG_LOG, NONE, NONE.
892 enquire.set_weighting_scheme(
893 Xapian::TfIdfWeight(
894 Xapian::TfIdfWeight::wdf_norm::AUG_LOG,
895 Xapian::TfIdfWeight::idf_norm::NONE,
896 Xapian::TfIdfWeight::wt_norm::NONE));
897 mset = enquire.get_mset(0, 10);
898 TEST_EQUAL(mset.size(), 2);
899 mset_expect_order(mset, 2, 4);
900 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.2 + 0.8 * log(1.0 + 8));
901 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.2 + 0.8 * log(1.0 + 1));
903 // Check for NONE, GLOBAL_FREQ, NONE.
904 enquire.set_weighting_scheme(
905 Xapian::TfIdfWeight(
906 Xapian::TfIdfWeight::wdf_norm::NONE,
907 Xapian::TfIdfWeight::idf_norm::GLOBAL_FREQ,
908 Xapian::TfIdfWeight::wt_norm::NONE));
909 mset = enquire.get_mset(0, 10);
910 TEST_EQUAL(mset.size(), 2);
911 mset_expect_order(mset, 2, 4);
912 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * (9.0 / 2));
913 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * (9.0 / 2));
915 // Check for SQRT, NONE, NONE.
916 enquire.set_weighting_scheme(
917 Xapian::TfIdfWeight(
918 Xapian::TfIdfWeight::wdf_norm::SQRT,
919 Xapian::TfIdfWeight::idf_norm::NONE,
920 Xapian::TfIdfWeight::wt_norm::NONE));
921 mset = enquire.get_mset(0, 10);
922 TEST_EQUAL(mset.size(), 2);
923 mset_expect_order(mset, 2, 4);
924 TEST_EQUAL_DOUBLE(mset[0].get_weight(), sqrt(8 - 0.5) + 1);
925 TEST_EQUAL_DOUBLE(mset[1].get_weight(), sqrt(1 - 0.5) + 1);
927 // Check for NONE, LOG_GLOBAL_FREQ, NONE.
928 enquire.set_weighting_scheme(
929 Xapian::TfIdfWeight(
930 Xapian::TfIdfWeight::wdf_norm::NONE,
931 Xapian::TfIdfWeight::idf_norm::LOG_GLOBAL_FREQ,
932 Xapian::TfIdfWeight::wt_norm::NONE));
933 mset = enquire.get_mset(0, 10);
934 TEST_EQUAL(mset.size(), 2);
935 mset_expect_order(mset, 2, 4);
936 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log(9.0 / 2 + 1));
937 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log(9.0 / 2 + 1));
939 // Check for NONE, INCREMENTED_GLOBAL_FREQ, NONE.
940 enquire.set_weighting_scheme(
941 Xapian::TfIdfWeight(
942 Xapian::TfIdfWeight::wdf_norm::NONE,
943 Xapian::TfIdfWeight::idf_norm::INCREMENTED_GLOBAL_FREQ,
944 Xapian::TfIdfWeight::wt_norm::NONE));
945 mset = enquire.get_mset(0, 10);
946 TEST_EQUAL(mset.size(), 2);
947 mset_expect_order(mset, 2, 4);
948 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * (9.0 / 2 + 1));
949 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * (9.0 / 2 + 1));
951 // Check for NONE, SQRT_GLOBAL_FREQ, NONE.
952 enquire.set_weighting_scheme(
953 Xapian::TfIdfWeight(
954 Xapian::TfIdfWeight::wdf_norm::NONE,
955 Xapian::TfIdfWeight::idf_norm::SQRT_GLOBAL_FREQ,
956 Xapian::TfIdfWeight::wt_norm::NONE));
957 mset = enquire.get_mset(0, 10);
958 TEST_EQUAL(mset.size(), 2);
959 mset_expect_order(mset, 2, 4);
960 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * sqrt(9.0 / 2 - 0.9));
961 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * sqrt(9.0 / 2 - 0.9));
963 // Check for AUG_AVERAGE, NONE, NONE.
964 enquire.set_weighting_scheme(
965 Xapian::TfIdfWeight(
966 Xapian::TfIdfWeight::wdf_norm::AUG_AVERAGE,
967 Xapian::TfIdfWeight::idf_norm::NONE,
968 Xapian::TfIdfWeight::wt_norm::NONE));
969 mset = enquire.get_mset(0, 10);
970 TEST_EQUAL(mset.size(), 2);
971 mset_expect_order(mset, 2, 4);
972 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.9 + 0.1 * (8.0 / (81.0 / 56.0)));
973 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.9 + 0.1 * (1.0 / (31.0 / 26.0)));
975 // Check for MAX, NONE, NONE.
976 enquire.set_weighting_scheme(
977 Xapian::TfIdfWeight(
978 Xapian::TfIdfWeight::wdf_norm::MAX,
979 Xapian::TfIdfWeight::idf_norm::NONE,
980 Xapian::TfIdfWeight::wt_norm::NONE));
981 mset = enquire.get_mset(0, 10);
982 TEST_EQUAL(mset.size(), 2);
983 mset_expect_order(mset, 2, 4);
984 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 8);
985 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0 / 4);
987 // Check for AUG, NONE, NONE.
988 enquire.set_weighting_scheme(
989 Xapian::TfIdfWeight(
990 Xapian::TfIdfWeight::wdf_norm::AUG,
991 Xapian::TfIdfWeight::idf_norm::NONE,
992 Xapian::TfIdfWeight::wt_norm::NONE));
993 mset = enquire.get_mset(0, 10);
994 TEST_EQUAL(mset.size(), 2);
995 mset_expect_order(mset, 2, 4);
996 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
997 TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
1000 // Feature tests for pivoted normalization functions.
1001 DEFINE_TESTCASE(tfidfweight4, backend) {
1002 Xapian::Database db = get_database("apitest_simpledata");
1003 Xapian::Enquire enquire(db);
1004 Xapian::Query query("paragraph");
1005 Xapian::MSet mset;
1007 // Check for "PPn" normalization string.
1008 enquire.set_query(query);
1009 enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1010 mset = enquire.get_mset(0, 10);
1011 TEST_EQUAL(mset.size(), 5);
1012 // Shorter docs should ranker higher if wqf is equal among all the docs.
1013 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
1014 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
1016 // Check that wqf is taken into account.
1017 enquire.set_query(Xapian::Query("paragraph", 2));
1018 enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1019 Xapian::MSet mset2 = enquire.get_mset(0, 10);
1020 TEST_EQUAL(mset2.size(), 5);
1021 // wqf is 2, so weights should be doubled.
1022 TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
1023 TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
1025 // check for "nPn" which represents "xPx"
1026 enquire.set_query(Xapian::Query("word"));
1027 enquire.set_weighting_scheme(Xapian::TfIdfWeight("nPn", 0.2, 1.0));
1028 mset = enquire.get_mset(0, 10);
1029 TEST_EQUAL(mset.size(), 2);
1030 // Expect doc 2 with query "word" to have higher weight than doc 4.
1031 mset_expect_order(mset, 2, 4);
1033 // check for "Ptn" which represents "Pxx"
1034 enquire.set_query(Xapian::Query("word"));
1035 enquire.set_weighting_scheme(Xapian::TfIdfWeight("Ptn", 0.2, 1.0));
1036 mset = enquire.get_mset(0, 10);
1037 TEST_EQUAL(mset.size(), 2);
1038 // Expect doc 2 with query "word" to have higher weight than doc 4.
1039 mset_expect_order(mset, 2, 4);
1041 // Check for PIVOTED, PIVOTED, NONE normalization string.
1042 enquire.set_query(query);
1043 enquire.set_weighting_scheme(
1044 Xapian::TfIdfWeight(
1045 Xapian::TfIdfWeight::wdf_norm::PIVOTED,
1046 Xapian::TfIdfWeight::idf_norm::PIVOTED,
1047 Xapian::TfIdfWeight::wt_norm::NONE));
1048 mset = enquire.get_mset(0, 10);
1049 TEST_EQUAL(mset.size(), 5);
1050 // Shorter docs should ranker higher if wqf is equal among all the docs.
1051 TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
1052 TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
1054 // Check that wqf is taken into account.
1055 enquire.set_query(Xapian::Query("paragraph", 2));
1056 mset2 = enquire.get_mset(0, 10);
1057 TEST_EQUAL(mset2.size(), 5);
1058 // wqf is 2, so weights should be doubled.
1059 TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
1060 TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
1062 // check for NONE, PIVOTED, NONE
1063 enquire.set_query(Xapian::Query("word"));
1064 enquire.set_weighting_scheme(
1065 Xapian::TfIdfWeight(
1066 Xapian::TfIdfWeight::wdf_norm::NONE,
1067 Xapian::TfIdfWeight::idf_norm::PIVOTED,
1068 Xapian::TfIdfWeight::wt_norm::NONE));
1069 mset = enquire.get_mset(0, 10);
1070 TEST_EQUAL(mset.size(), 2);
1071 // Expect doc 2 with query "word" to have higher weight than doc 4.
1072 mset_expect_order(mset, 2, 4);
1074 // check for PIVOTED, TFIDF, NONE
1075 enquire.set_query(Xapian::Query("word"));
1076 enquire.set_weighting_scheme(
1077 Xapian::TfIdfWeight(
1078 Xapian::TfIdfWeight::wdf_norm::PIVOTED,
1079 Xapian::TfIdfWeight::idf_norm::TFIDF,
1080 Xapian::TfIdfWeight::wt_norm::NONE));
1081 mset = enquire.get_mset(0, 10);
1082 TEST_EQUAL(mset.size(), 2);
1083 // Expect doc 2 with query "word" to have higher weight than doc 4.
1084 mset_expect_order(mset, 2, 4);
1087 // Check that create_from_parameters() creates the correct object.
1088 DEFINE_TESTCASE(tfidfweight5, !backend) {
1089 auto wt_ptr = Xapian::Weight::create("tfidf NONE TFIDF NONE");
1090 auto wt = Xapian::TfIdfWeight(Xapian::TfIdfWeight::wdf_norm::NONE,
1091 Xapian::TfIdfWeight::idf_norm::TFIDF,
1092 Xapian::TfIdfWeight::wt_norm::NONE);
1093 TEST_EQUAL(wt_ptr->serialise(), wt.serialise());
1094 delete wt_ptr;
1096 auto wt_ptr2 = Xapian::Weight::create("tfidf SQRT PIVOTED NONE");
1097 auto wt2 = Xapian::TfIdfWeight(Xapian::TfIdfWeight::wdf_norm::SQRT,
1098 Xapian::TfIdfWeight::idf_norm::PIVOTED,
1099 Xapian::TfIdfWeight::wt_norm::NONE);
1100 TEST_EQUAL(wt_ptr2->serialise(), wt2.serialise());
1101 delete wt_ptr2;
1104 class CheckInitWeight : public Xapian::Weight {
1105 public:
1106 double factor;
1108 unsigned & zero_inits, & non_zero_inits;
1110 CheckInitWeight(unsigned &z, unsigned &n)
1111 : factor(-1.0), zero_inits(z), non_zero_inits(n) {
1112 need_stat(DOC_LENGTH);
1115 void init(double factor_) override {
1116 factor = factor_;
1117 if (factor == 0.0)
1118 ++zero_inits;
1119 else
1120 ++non_zero_inits;
1123 Weight* clone() const override {
1124 return new CheckInitWeight(zero_inits, non_zero_inits);
1127 double get_sumpart(Xapian::termcount, Xapian::termcount,
1128 Xapian::termcount, Xapian::termcount) const override {
1129 return 1.0;
1132 double get_maxpart() const override { return 1.0; }
1134 double get_sumextra(Xapian::termcount doclen,
1135 Xapian::termcount,
1136 Xapian::termcount) const override {
1137 return 1.0 / doclen;
1140 double get_maxextra() const override { return 1.0; }
1143 /// Regression test - check init() is called for the term-indep Weight obj.
1144 DEFINE_TESTCASE(checkinitweight1, backend && !multi && !remote) {
1145 Xapian::Database db = get_database("apitest_simpledata");
1146 Xapian::Enquire enquire(db);
1147 Xapian::Query q(Xapian::Query::OP_AND,
1148 Xapian::Query("this"), Xapian::Query("paragraph"));
1149 enquire.set_query(q);
1150 unsigned zero_inits = 0, non_zero_inits = 0;
1151 CheckInitWeight wt(zero_inits, non_zero_inits);
1152 enquire.set_weighting_scheme(wt);
1153 Xapian::MSet mset = enquire.get_mset(0, 3);
1154 TEST_EQUAL(zero_inits, 1);
1155 TEST_EQUAL(non_zero_inits, 2);
1158 class CheckStatsWeight : public Xapian::Weight {
1159 public:
1160 double factor = -1.0;
1162 Xapian::Database db;
1164 string term1;
1166 // When testing OP_SYNONYM, term2 is also set.
1167 // When testing OP_WILDCARD, term2 == "*".
1168 // When testing a repeated term, term2 == "=" for the first occurrence and
1169 // "_" for subsequent occurrences.
1170 mutable string term2;
1172 Xapian::termcount & sum;
1173 Xapian::termcount & sum_squares;
1175 mutable Xapian::termcount len_upper = 0;
1176 mutable Xapian::termcount len_lower = Xapian::termcount(-1);
1177 mutable Xapian::termcount uniqueterms_upper = 0;
1178 mutable Xapian::termcount uniqueterms_lower = Xapian::termcount(-1);
1179 mutable Xapian::termcount wdf_upper = 0;
1181 CheckStatsWeight(const Xapian::Database & db_,
1182 const string & term1_,
1183 const string & term2_,
1184 Xapian::termcount & sum_,
1185 Xapian::termcount & sum_squares_)
1186 : db(db_), term1(term1_), term2(term2_),
1187 sum(sum_), sum_squares(sum_squares_)
1189 need_stat(COLLECTION_SIZE);
1190 need_stat(RSET_SIZE);
1191 need_stat(AVERAGE_LENGTH);
1192 need_stat(TERMFREQ);
1193 need_stat(RELTERMFREQ);
1194 need_stat(QUERY_LENGTH);
1195 need_stat(WQF);
1196 need_stat(WDF);
1197 need_stat(DOC_LENGTH);
1198 need_stat(DOC_LENGTH_MIN);
1199 need_stat(DOC_LENGTH_MAX);
1200 need_stat(DB_DOC_LENGTH_MIN);
1201 need_stat(DB_DOC_LENGTH_MAX);
1202 need_stat(WDF_MAX);
1203 need_stat(COLLECTION_FREQ);
1204 need_stat(UNIQUE_TERMS);
1205 need_stat(UNIQUE_TERMS_MIN);
1206 need_stat(UNIQUE_TERMS_MAX);
1207 need_stat(DB_UNIQUE_TERMS_MIN);
1208 need_stat(DB_UNIQUE_TERMS_MAX);
1209 need_stat(TOTAL_LENGTH);
1210 need_stat(WDF_DOC_MAX);
1213 CheckStatsWeight(const Xapian::Database & db_,
1214 const string & term_,
1215 Xapian::termcount & sum_,
1216 Xapian::termcount & sum_squares_)
1217 : CheckStatsWeight(db_, term_, string(), sum_, sum_squares_) { }
1219 void init(double factor_) override {
1220 factor = factor_;
1223 Weight* clone() const override {
1224 auto res = new CheckStatsWeight(db, term1, term2, sum, sum_squares);
1225 if (term2 == "=") {
1226 // The object passed to Enquire::set_weighting_scheme() is cloned
1227 // right away, and then cloned again for each term, and then
1228 // potentially once more for the term-independent weight
1229 // contribution. In the repeated case, we want to handle the first
1230 // actual term specially, so we arrange for that to have "=" for
1231 // term2, and subsequent clones to have "_", so that we accumulate
1232 // sum and sum_squares on the first occurrence only.
1233 term2 = "_";
1235 return res;
1238 double get_sumpart(Xapian::termcount wdf,
1239 Xapian::termcount doclen,
1240 Xapian::termcount uniqueterms,
1241 Xapian::termcount wdfdocmax) const override {
1242 Xapian::doccount num_docs = db.get_doccount();
1243 TEST_EQUAL(get_collection_size(), num_docs);
1244 TEST_EQUAL(get_rset_size(), 0);
1245 TEST_EQUAL(get_average_length(), db.get_avlength());
1246 Xapian::totallength totlen = get_total_length();
1247 TEST_EQUAL(totlen, db.get_total_length());
1248 double total_term_occurences = get_average_length() * num_docs;
1249 TEST_EQUAL(Xapian::totallength(total_term_occurences + 0.5), totlen);
1250 if (term2.empty() || term2 == "=" || term2 == "_") {
1251 TEST_EQUAL(get_termfreq(), db.get_termfreq(term1));
1252 TEST_EQUAL(get_collection_freq(), db.get_collection_freq(term1));
1253 if (term2.empty()) {
1254 TEST_EQUAL(get_query_length(), 1);
1255 } else {
1256 TEST_EQUAL(get_query_length(), 2);
1258 } else {
1259 Xapian::doccount tfmax = 0, tfsum = 0;
1260 Xapian::termcount cfmax = 0, cfsum = 0;
1261 if (term2 == "*") {
1262 // OP_WILDCARD case.
1263 for (auto&& t = db.allterms_begin(term1);
1264 t != db.allterms_end(term1); ++t) {
1265 Xapian::doccount tf = t.get_termfreq();
1266 tout << "->" << *t << " " << tf << '\n';
1267 tfsum += tf;
1268 tfmax = max(tfmax, tf);
1269 Xapian::termcount cf = db.get_collection_freq(*t);
1270 cfsum += cf;
1271 cfmax = max(cfmax, cf);
1273 TEST_EQUAL(get_query_length(), 1);
1274 } else {
1275 // OP_SYNONYM case.
1276 Xapian::doccount tf1 = db.get_termfreq(term1);
1277 Xapian::doccount tf2 = db.get_termfreq(term2);
1278 tfsum = tf1 + tf2;
1279 tfmax = max(tf1, tf2);
1280 Xapian::termcount cf1 = db.get_collection_freq(term1);
1281 Xapian::termcount cf2 = db.get_collection_freq(term2);
1282 cfsum = cf1 + cf2;
1283 cfmax = max(cf1, cf2);
1284 TEST_EQUAL(get_query_length(), 2);
1286 // Synonym occurs at least as many times as any term.
1287 TEST_REL(get_termfreq(), >=, tfmax);
1288 TEST_REL(get_collection_freq(), >=, cfmax);
1289 // Synonym can't occur more times than the terms do.
1290 TEST_REL(get_termfreq(), <=, tfsum);
1291 TEST_REL(get_collection_freq(), <=, cfsum);
1292 // Synonym can't occur more times than there are documents/terms.
1293 TEST_REL(get_termfreq(), <=, num_docs);
1294 TEST_REL(get_collection_freq(), <=, totlen);
1296 TEST_EQUAL(get_reltermfreq(), 0);
1297 TEST_EQUAL(get_wqf(), 1);
1298 TEST_REL(doclen,>=,len_lower);
1299 TEST_REL(doclen,<=,len_upper);
1300 if (doclen > 0) {
1301 TEST_REL(uniqueterms,>=,1);
1302 TEST_REL(uniqueterms_lower,>=,1);
1303 TEST_REL(wdfdocmax,>=,1);
1305 TEST_REL(uniqueterms,>=,uniqueterms_lower);
1306 TEST_REL(uniqueterms,<=,uniqueterms_upper);
1307 TEST_REL(uniqueterms,<=,doclen);
1308 TEST_REL(uniqueterms_upper,<=,len_upper);
1309 TEST_REL(wdf,<=,wdf_upper);
1310 TEST_REL(wdfdocmax,<=,doclen);
1311 TEST_REL(wdfdocmax,>=,wdf);
1313 auto db_len_lower = db.get_doclength_lower_bound();
1314 auto db_len_upper = db.get_doclength_upper_bound();
1315 auto db_uniqueterms_lower = db.get_unique_terms_lower_bound();
1316 auto db_uniqueterms_upper = db.get_unique_terms_upper_bound();
1317 TEST_EQUAL(get_db_doclength_lower_bound(), db_len_lower);
1318 TEST_EQUAL(get_db_doclength_upper_bound(), db_len_upper);
1319 TEST_EQUAL(get_db_unique_terms_lower_bound(), db_uniqueterms_lower);
1320 TEST_EQUAL(get_db_unique_terms_upper_bound(), db_uniqueterms_upper);
1321 if (db.size() == 1) {
1322 TEST_EQUAL(len_lower, db_len_lower);
1323 TEST_EQUAL(len_upper, db_len_upper);
1324 TEST_EQUAL(uniqueterms_lower, db_uniqueterms_lower);
1325 TEST_EQUAL(uniqueterms_upper, db_uniqueterms_upper);
1326 } else {
1327 TEST_REL(len_lower,>=,db_len_lower);
1328 TEST_REL(len_upper,<=,db_len_upper);
1329 TEST_REL(uniqueterms_lower,>=,db_uniqueterms_lower);
1330 TEST_REL(uniqueterms_upper,<=,db_uniqueterms_upper);
1332 if (term2 != "_") {
1333 sum += wdf;
1334 sum_squares += wdf * wdf;
1336 return 1.0;
1339 double get_maxpart() const override {
1340 if (len_upper == 0) {
1341 len_lower = get_doclength_lower_bound();
1342 len_upper = get_doclength_upper_bound();
1343 uniqueterms_lower = get_unique_terms_lower_bound();
1344 uniqueterms_upper = get_unique_terms_upper_bound();
1345 wdf_upper = get_wdf_upper_bound();
1347 return 1.0;
1350 double get_sumextra(Xapian::termcount doclen,
1351 Xapian::termcount,
1352 Xapian::termcount) const override {
1353 return 1.0 / doclen;
1356 double get_maxextra() const override { return 1.0; }
1359 /// Check the weight subclass gets the correct stats.
1360 DEFINE_TESTCASE(checkstatsweight1, backend && !remote) {
1361 Xapian::Database db = get_database("apitest_simpledata");
1362 Xapian::Enquire enquire(db);
1363 Xapian::TermIterator a;
1364 for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1365 const string & term = *a;
1366 enquire.set_query(Xapian::Query(term));
1367 Xapian::termcount sum = 0;
1368 Xapian::termcount sum_squares = 0;
1369 CheckStatsWeight wt(db, term, sum, sum_squares);
1370 enquire.set_weighting_scheme(wt);
1371 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1373 // The document order in the multi-db case isn't the same as the
1374 // postlist order on the combined DB, so it's hard to compare the
1375 // wdf for each document in the Weight objects, but we can sum
1376 // the wdfs and the squares of the wdfs which provides a decent
1377 // check that we're not getting the wrong wdf values (it ensures
1378 // they have the right mean and standard deviation).
1379 Xapian::termcount expected_sum = 0;
1380 Xapian::termcount expected_sum_squares = 0;
1381 Xapian::PostingIterator i;
1382 for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1383 Xapian::termcount wdf = i.get_wdf();
1384 expected_sum += wdf;
1385 expected_sum_squares += wdf * wdf;
1387 TEST_EQUAL(sum, expected_sum);
1388 TEST_EQUAL(sum_squares, expected_sum_squares);
1392 /// Check the weight subclass gets the correct stats with OP_SYNONYM.
1393 // Regression test for bugs fixed in 1.4.1.
1394 DEFINE_TESTCASE(checkstatsweight2, backend && !remote) {
1395 Xapian::Database db = get_database("apitest_simpledata");
1396 Xapian::Enquire enquire(db);
1397 Xapian::TermIterator a;
1398 for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1399 const string & term1 = *a;
1400 if (++a == db.allterms_end()) break;
1401 const string & term2 = *a;
1402 Xapian::Query q(Xapian::Query::OP_SYNONYM,
1403 Xapian::Query(term1), Xapian::Query(term2));
1404 tout << q.get_description() << '\n';
1405 enquire.set_query(q);
1406 Xapian::termcount sum = 0;
1407 Xapian::termcount sum_squares = 0;
1408 CheckStatsWeight wt(db, term1, term2, sum, sum_squares);
1409 enquire.set_weighting_scheme(wt);
1410 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1412 // The document order in the multi-db case isn't the same as the
1413 // postlist order on the combined DB, so it's hard to compare the
1414 // wdf for each document in the Weight objects, but we can sum
1415 // the wdfs and the squares of the wdfs which provides a decent
1416 // check that we're not getting the wrong wdf values (it ensures
1417 // they have the right mean and standard deviation).
1418 Xapian::termcount expected_sum = 0;
1419 Xapian::termcount expected_sum_squares = 0;
1420 Xapian::PostingIterator i = db.postlist_begin(term1);
1421 Xapian::PostingIterator j = db.postlist_begin(term2);
1422 Xapian::docid did1 = *i, did2 = *j;
1423 while (true) {
1424 // To calculate expected_sum_squares correctly we need to square
1425 // the sum per document.
1426 Xapian::termcount wdf;
1427 if (did1 == did2) {
1428 wdf = i.get_wdf() + j.get_wdf();
1429 did1 = did2 = 0;
1430 } else if (did1 < did2) {
1431 wdf = i.get_wdf();
1432 did1 = 0;
1433 } else {
1434 wdf = j.get_wdf();
1435 did2 = 0;
1437 expected_sum += wdf;
1438 expected_sum_squares += wdf * wdf;
1440 if (did1 == 0) {
1441 if (++i != db.postlist_end(term1)) {
1442 did1 = *i;
1443 } else {
1444 if (did2 == Xapian::docid(-1)) break;
1445 did1 = Xapian::docid(-1);
1448 if (did2 == 0) {
1449 if (++j != db.postlist_end(term2)) {
1450 did2 = *j;
1451 } else {
1452 if (did1 == Xapian::docid(-1)) break;
1453 did2 = Xapian::docid(-1);
1457 // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1458 // the individual terms.
1459 TEST_EQUAL(sum, expected_sum);
1460 TEST_REL(sum_squares, >=, expected_sum_squares);
1464 /// Check the weight subclass gets the correct stats with OP_WILDCARD.
1465 // Regression test for bug fixed in 1.4.1.
1466 DEFINE_TESTCASE(checkstatsweight3, backend && !remote) {
1467 // The most correct thing to do would be to collate termfreqs across shards
1468 // for this, but if that's too hard to do efficiently we could at least
1469 // scale up the termfreqs proportional to the size of the shard.
1470 XFAIL_FOR_BACKEND("multi", "OP_WILDCARD+OP_SYNONYM use shard termfreqs");
1472 struct PlCmp {
1473 bool operator()(const Xapian::PostingIterator& a,
1474 const Xapian::PostingIterator& b) {
1475 return *a < *b;
1479 Xapian::Database db = get_database("apitest_simpledata");
1480 Xapian::Enquire enquire(db);
1481 Xapian::TermIterator a;
1482 static const char * const testcases[] = {
1483 "a", // a* matches all documents, but no term matches all.
1484 "pa", // Expands to only "paragraph", matching 5.
1485 "zulu", // No matches.
1486 "th", // Term "this" matches all documents.
1488 for (auto pattern : testcases) {
1489 Xapian::Query q(Xapian::Query::OP_WILDCARD, pattern);
1490 tout << q.get_description() << '\n';
1491 enquire.set_query(q);
1492 Xapian::termcount sum = 0;
1493 Xapian::termcount sum_squares = 0;
1494 CheckStatsWeight wt(db, pattern, "*", sum, sum_squares);
1495 enquire.set_weighting_scheme(wt);
1496 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1498 // The document order in the multi-db case isn't the same as the
1499 // postlist order on the combined DB, so it's hard to compare the
1500 // wdf for each document in the Weight objects, but we can sum
1501 // the wdfs and the squares of the wdfs which provides a decent
1502 // check that we're not getting the wrong wdf values (it ensures
1503 // they have the right mean and standard deviation).
1504 Xapian::termcount expected_sum = 0;
1505 Xapian::termcount expected_sum_squares = 0;
1506 vector<Xapian::PostingIterator> postlists;
1507 for (auto&& t = db.allterms_begin(pattern);
1508 t != db.allterms_end(pattern); ++t) {
1509 postlists.emplace_back(db.postlist_begin(*t));
1511 Heap::make(postlists.begin(), postlists.end(), PlCmp());
1512 Xapian::docid did = 0;
1513 Xapian::termcount wdf = 0;
1514 while (!postlists.empty()) {
1515 Xapian::docid did_new = *postlists.front();
1516 Xapian::termcount wdf_new = postlists.front().get_wdf();
1517 if (++(postlists.front()) == Xapian::PostingIterator()) {
1518 Heap::pop(postlists.begin(), postlists.end(), PlCmp());
1519 postlists.pop_back();
1520 } else {
1521 Heap::replace(postlists.begin(), postlists.end(), PlCmp());
1523 if (did_new != did) {
1524 expected_sum += wdf;
1525 expected_sum_squares += wdf * wdf;
1526 wdf = 0;
1527 did = did_new;
1529 wdf += wdf_new;
1531 expected_sum += wdf;
1532 expected_sum_squares += wdf * wdf;
1533 // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1534 // the individual terms.
1535 TEST_EQUAL(sum, expected_sum);
1536 TEST_REL(sum_squares, >=, expected_sum_squares);
1540 /// Check the stats for a repeated term are correct.
1541 // Regression test for bug fixed in 1.4.6. Doesn't work with
1542 // multi as the weight object is cloned more times.
1543 DEFINE_TESTCASE(checkstatsweight4, backend && !remote && !multi) {
1544 Xapian::Database db = get_database("apitest_simpledata");
1545 Xapian::Enquire enquire(db);
1546 Xapian::TermIterator a;
1547 for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1548 const string & term = *a;
1549 enquire.set_query(Xapian::Query(term, 1, 1) |
1550 Xapian::Query(term, 1, 2));
1551 Xapian::termcount sum = 0;
1552 Xapian::termcount sum_squares = 0;
1553 CheckStatsWeight wt(db, term, "=", sum, sum_squares);
1554 enquire.set_weighting_scheme(wt);
1555 Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1557 // The document order in the multi-db case isn't the same as the
1558 // postlist order on the combined DB, so it's hard to compare the
1559 // wdf for each document in the Weight objects, but we can sum
1560 // the wdfs and the squares of the wdfs which provides a decent
1561 // check that we're not getting the wrong wdf values (it ensures
1562 // they have the right mean and standard deviation).
1563 Xapian::termcount expected_sum = 0;
1564 Xapian::termcount expected_sum_squares = 0;
1565 Xapian::PostingIterator i;
1566 for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1567 Xapian::termcount wdf = i.get_wdf();
1568 expected_sum += wdf;
1569 expected_sum_squares += wdf * wdf;
1571 TEST_EQUAL(sum, expected_sum);
1572 TEST_EQUAL(sum_squares, expected_sum_squares);
1576 class CheckStatsWeight5 : public Xapian::Weight {
1577 public:
1578 mutable Xapian::docid did = 0;
1580 double factor;
1582 Xapian::Database db;
1584 char stat_code;
1586 explicit
1587 CheckStatsWeight5(const Xapian::Database& db_, char stat_code_ = '\0')
1588 : factor(-1.0), db(db_), stat_code(stat_code_)
1590 switch (stat_code) {
1591 case 'w':
1592 need_stat(WDF);
1593 break;
1594 case 'd':
1595 need_stat(DOC_LENGTH);
1596 break;
1598 need_stat(WDF_DOC_MAX);
1601 void init(double factor_) override {
1602 factor = factor_;
1605 Weight* clone() const override {
1606 return new CheckStatsWeight5(db, stat_code);
1609 double get_sumpart(Xapian::termcount,
1610 Xapian::termcount,
1611 Xapian::termcount,
1612 Xapian::termcount wdfdocmax) const override {
1613 // The query is a synonym of all terms, so should match all documents.
1614 ++did;
1615 TEST_REL(wdfdocmax,==,db.get_doclength(did));
1616 return 1.0 / wdfdocmax;
1619 double get_maxpart() const override {
1620 return 1.0;
1624 /// Check wdfdocmax is clamped to doclen even if wdf and doclen aren't wanted.
1625 DEFINE_TESTCASE(checkstatsweight5, backend && !multi && !remote) {
1626 Xapian::Database db = get_database("apitest_simpledata");
1627 Xapian::Enquire enquire(db);
1628 Xapian::Query q{Xapian::Query::OP_SYNONYM,
1629 db.allterms_begin(),
1630 db.allterms_end()};
1631 enquire.set_query(q);
1632 enquire.set_weighting_scheme(CheckStatsWeight5(db));
1633 Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
1634 enquire.set_weighting_scheme(CheckStatsWeight5(db, 'w'));
1635 Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
1636 enquire.set_weighting_scheme(CheckStatsWeight5(db, 'd'));
1637 Xapian::MSet mset3 = enquire.get_mset(0, db.get_doccount());
1640 // Two stage should perform same as Jelinek mercer if smoothing parameter for mercer is kept 1 in both.
1641 DEFINE_TESTCASE(unigramlmweight4, backend) {
1642 Xapian::Database db = get_database("apitest_simpledata");
1643 Xapian::Enquire enquire1(db);
1644 Xapian::Enquire enquire2(db);
1645 enquire1.set_query(Xapian::Query("paragraph"));
1646 Xapian::MSet mset1;
1647 enquire2.set_query(Xapian::Query("paragraph"));
1648 Xapian::MSet mset2;
1649 // 5 documents available with term paragraph so mset size should be 5
1650 enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::TWO_STAGE_SMOOTHING, 1, 0));
1651 enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 1, 0));
1652 mset1 = enquire1.get_mset(0, 10);
1653 mset2 = enquire2.get_mset(0, 10);
1655 TEST_EQUAL(mset1.size(), 5);
1656 TEST_EQUAL_DOUBLE(mset1[1].get_weight(), mset2[1].get_weight());
1659 /* Test for checking if we don't use smoothing all
1660 * of them should give same result i.e wdf_double/len_double */
1661 DEFINE_TESTCASE(unigramlmweight5, backend) {
1662 Xapian::Database db = get_database("apitest_simpledata");
1663 Xapian::Enquire enquire1(db);
1664 Xapian::Enquire enquire2(db);
1665 Xapian::Enquire enquire3(db);
1666 Xapian::Enquire enquire4(db);
1667 enquire1.set_query(Xapian::Query("paragraph"));
1668 Xapian::MSet mset1;
1669 enquire2.set_query(Xapian::Query("paragraph"));
1670 Xapian::MSet mset2;
1671 enquire3.set_query(Xapian::Query("paragraph"));
1672 Xapian::MSet mset3;
1673 enquire4.set_query(Xapian::Query("paragraph"));
1674 Xapian::MSet mset4;
1675 // 5 documents available with term paragraph so mset size should be 5
1676 enquire1.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0, 0));
1677 enquire2.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0, 0));
1678 enquire3.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING, 0, 0));
1679 enquire4.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::DIRICHLET_SMOOTHING, 0, 0));
1681 mset1 = enquire1.get_mset(0, 10);
1682 mset2 = enquire2.get_mset(0, 10);
1683 mset3 = enquire3.get_mset(0, 10);
1684 mset4 = enquire4.get_mset(0, 10);
1686 TEST_EQUAL(mset1.size(), 5);
1687 TEST_EQUAL(mset2.size(), 5);
1688 TEST_EQUAL(mset3.size(), 5);
1689 TEST_EQUAL(mset4.size(), 5);
1690 for (Xapian::doccount i = 0; i < 5; ++i) {
1691 TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset4[i].get_weight());
1692 TEST_EQUAL_DOUBLE(mset2[i].get_weight(), mset4[i].get_weight());
1693 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset2[i].get_weight());
1694 TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset2[i].get_weight());
1695 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset4[i].get_weight());
1696 TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight());
1700 // Feature test for Dir+ function.
1701 DEFINE_TESTCASE(unigramlmweight7, backend) {
1702 Xapian::Database db = get_database("apitest_simpledata");
1703 Xapian::Enquire enquire1(db);
1704 Xapian::Enquire enquire2(db);
1705 enquire1.set_query(Xapian::Query("paragraph"));
1706 enquire2.set_query(Xapian::Query("paragraph"));
1707 Xapian::MSet mset1;
1708 Xapian::MSet mset2;
1710 enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1711 enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING, 2000, 0.05));
1713 mset1 = enquire1.get_mset(0, 10);
1714 mset2 = enquire2.get_mset(0, 10);
1716 // mset size should be 5
1717 TEST_EQUAL(mset1.size(), 5);
1718 TEST_EQUAL(mset2.size(), 5);
1720 // Expect mset weights associated with Dir+ more than mset weights by Dir
1721 // because of the presence of extra weight component in Dir+ function.
1722 TEST_REL(mset2[0].get_weight(),>,mset1[0].get_weight());
1723 TEST_REL(mset2[1].get_weight(),>,mset1[1].get_weight());
1724 TEST_REL(mset2[2].get_weight(),>,mset1[2].get_weight());
1725 TEST_REL(mset2[3].get_weight(),>,mset1[3].get_weight());
1726 TEST_REL(mset2[4].get_weight(),>,mset1[4].get_weight());
1729 // Regression test that OP_SCALE_WEIGHT works with LMWeight (fixed in 1.4.1).
1730 DEFINE_TESTCASE(unigramlmweight8, backend) {
1731 Xapian::Database db = get_database("apitest_simpledata");
1732 Xapian::Enquire enquire(db);
1733 Xapian::Query query("paragraph");
1735 enquire.set_query(query);
1736 enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1738 Xapian::MSet mset1;
1739 mset1 = enquire.get_mset(0, 10);
1740 TEST_EQUAL(mset1.size(), 5);
1742 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
1743 enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1745 Xapian::MSet mset2;
1746 mset2 = enquire.get_mset(0, 10);
1747 TEST_EQUAL(mset2.size(), mset1.size());
1748 TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
1749 for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
1750 TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
1754 // Feature test for CoordWeight.
1755 DEFINE_TESTCASE(coordweight1, backend) {
1756 Xapian::Enquire enquire(get_database("apitest_simpledata"));
1757 enquire.set_weighting_scheme(Xapian::CoordWeight());
1758 static const char * const terms[] = {
1759 "this", "line", "paragraph", "rubbish"
1761 Xapian::Query query(Xapian::Query::OP_OR, terms, std::end(terms));
1762 enquire.set_query(query);
1763 Xapian::MSet mymset1 = enquire.get_mset(0, 100);
1764 // CoordWeight scores 1 for each matching term, so the weight should equal
1765 // the number of matching terms.
1766 for (Xapian::MSetIterator i = mymset1.begin(); i != mymset1.end(); ++i) {
1767 Xapian::termcount matching_terms = 0;
1768 Xapian::TermIterator t = enquire.get_matching_terms_begin(i);
1769 while (t != enquire.get_matching_terms_end(i)) {
1770 ++matching_terms;
1771 ++t;
1773 TEST_EQUAL(i.get_weight(), matching_terms);
1777 // Feature test.
1778 DEFINE_TESTCASE(dicecoeffweight2, backend) {
1779 Xapian::Database db = get_database("apitest_simpledata3");
1780 Xapian::Enquire enquire(db);
1781 static const char * const terms[] = {
1782 "one", "three"
1784 Xapian::Query query(Xapian::Query::OP_OR, terms, std::end(terms));
1785 enquire.set_query(query);
1786 enquire.set_weighting_scheme(Xapian::DiceCoeffWeight());
1788 Xapian::MSet mset1;
1789 mset1 = enquire.get_mset(0, 10);
1790 TEST_EQUAL(mset1.size(), 4);
1792 /* The weight value has been manually calculated by using the statistics
1793 * of the test database. */
1794 TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.571428571428571);
1795 TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.5);
1796 TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.2);
1797 TEST_EQUAL_DOUBLE(mset1[3].get_weight(), 0.181818181818182);
1800 // Test handling of a term with zero wdf.
1801 DEFINE_TESTCASE(dicecoeffweight3, backend) {
1802 Xapian::Database db = get_database("dicecoeffweight3",
1803 [](Xapian::WritableDatabase& wdb,
1804 const string&) {
1805 Xapian::Document doc;
1806 doc.add_term("radio", 2);
1807 doc.add_term("seahorse");
1808 doc.add_term("zebra");
1809 doc.add_boolean_term("false");
1810 doc.add_boolean_term("true");
1811 wdb.add_document(doc);
1813 Xapian::Enquire enquire(db);
1814 enquire.set_weighting_scheme(Xapian::DiceCoeffWeight());
1816 // OP_SYNONYM gives wdf zero is need_stat(WDF) isn't specified (and
1817 // it isn't by DiceCoeffWeight).
1818 Xapian::Query q(Xapian::Query::OP_SYNONYM,
1819 Xapian::Query("false"), Xapian::Query("true"));
1820 enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT,
1821 q, 6.0), 2);
1822 Xapian::MSet mset = enquire.get_mset(0, 10);
1823 TEST_EQUAL(mset.size(), 1);
1825 // factor * 2.0 * wqf / (query_length + unique_term_count)
1826 // = 6.0 * 2.0 * 1 / (2 + 4) = 2.0
1827 TEST_EQUAL_DOUBLE(mset[0].get_weight(), 2.0);