/** @file
 * @brief tests of Xapian::Weight subclasses
 */
/* Copyright (C) 2004-2024 Olly Betts
 * Copyright (C) 2013 Aarsh Shah
 * Copyright (C) 2016 Vivek Pal
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
25 #include "api_weight.h"
33 #include "testutils.h"
39 test_weight_class_no_params(const char* name
)
43 // Check name() returns the class name.
44 TEST_EQUAL(obj
.name(), name
);
45 // If there are no parameters, there's nothing to serialise.
46 string obj_serialised
= obj
.serialise();
47 TEST_EQUAL(obj_serialised
.size(), 0);
48 // Check serialising and unserialising gives object with same serialisation.
49 unique_ptr
<Xapian::Weight
> wt(W().unserialise(obj_serialised
));
50 TEST_EQUAL(obj_serialised
, wt
->serialise());
51 // Check that unserialise() throws suitable error for bad serialisation.
52 // The easy case to test is extra junk after the serialised weight.
54 unique_ptr
<Xapian::Weight
> bad(W().unserialise(obj_serialised
+ "X"));
55 FAIL_TEST(name
<< " did not throw for unserialise with junk appended");
56 } catch (const Xapian::SerialisationError
& e
) {
57 // Check the exception message contains the weighting scheme name
58 // (regression test for TradWeight's exception saying "BM25").
59 string target
= name
+ CONST_STRLEN("Xapian::");
60 TEST(e
.get_msg().find(target
) != string::npos
);
64 #define TEST_WEIGHT_CLASS_NO_PARAMS(W) test_weight_class_no_params<W>(#W)
68 test_weight_class(const char* name
, const W
& obj_default
, const W
& obj_other
)
72 // Check name() returns the class name.
73 TEST_EQUAL(obj
.name(), name
);
74 TEST_EQUAL(obj_default
.name(), name
);
75 TEST_EQUAL(obj_other
.name(), name
);
76 // Check serialisation matches that of object constructed with explicit
77 // parameter values of what the defaults are meant to be.
78 string obj_serialised
= obj
.serialise();
79 TEST_EQUAL(obj_serialised
, obj_default
.serialise());
80 // Check serialisation is different to object with different parameters.
81 string obj_other_serialised
= obj_other
.serialise();
82 TEST_NOT_EQUAL(obj_serialised
, obj_other_serialised
);
83 // Check serialising and unserialising gives object with same serialisation.
84 unique_ptr
<Xapian::Weight
> wt(W().unserialise(obj_serialised
));
85 TEST_EQUAL(obj_serialised
, wt
->serialise());
86 // Check serialising and unserialising of object with different parameters.
87 unique_ptr
<Xapian::Weight
> wt2(W().unserialise(obj_other_serialised
));
88 TEST_EQUAL(obj_other_serialised
, wt2
->serialise());
89 // Check that unserialise() throws suitable error for bad serialisation.
90 // The easy case to test is extra junk after the serialised weight.
92 unique_ptr
<Xapian::Weight
> bad(W().unserialise(obj_serialised
+ "X"));
93 FAIL_TEST(name
<< " did not throw for unserialise with junk appended");
94 } catch (const Xapian::SerialisationError
& e
) {
95 // Check the exception message contains the weighting scheme name
96 // (regression test for TradWeight's exception saying "BM25").
97 string target
= name
+ CONST_STRLEN("Xapian::");
98 TEST(e
.get_msg().find(target
) != string::npos
);
102 // W Should be the class name.
104 // DEFAULT should be a parenthesised parameter list to explicitly construct
105 // an object of class W with the documented default parameters.
107 // OTHER should be a parenthesised parameter list to construct an object with
108 // non-default parameters.
109 #define TEST_WEIGHT_CLASS(W, DEFAULT, OTHER) \
110 test_weight_class<W>(#W, W DEFAULT, W OTHER)
112 /// Test serialisation and introspection of built-in weighting schemes.
113 DEFINE_TESTCASE(weightserialisation1
, !backend
) {
114 // Parameter-free weighting schemes.
115 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::BoolWeight
);
116 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::CoordWeight
);
117 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DLHWeight
);
118 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DPHWeight
);
119 TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DiceCoeffWeight
);
121 // Parameterised weighting schemes.
122 TEST_WEIGHT_CLASS(Xapian::TradWeight
, (1.0), (2.0));
123 TEST_WEIGHT_CLASS(Xapian::BM25Weight
,
125 (1, 0.5, 1, 0.5, 0.5));
126 TEST_WEIGHT_CLASS(Xapian::BM25PlusWeight
,
127 (1, 0, 1, 0.5, 0.5, 1.0),
128 (1, 0, 1, 0.5, 0.5, 2.0));
129 TEST_WEIGHT_CLASS(Xapian::TfIdfWeight
, ("ntn"), ("bpn"));
130 TEST_WEIGHT_CLASS(Xapian::InL2Weight
, (1.0), (2.0));
131 TEST_WEIGHT_CLASS(Xapian::IfB2Weight
, (1.0), (2.0));
132 TEST_WEIGHT_CLASS(Xapian::IneB2Weight
, (1.0), (2.0));
133 TEST_WEIGHT_CLASS(Xapian::BB2Weight
, (1.0), (2.0));
134 TEST_WEIGHT_CLASS(Xapian::PL2Weight
, (1.0), (2.0));
135 TEST_WEIGHT_CLASS(Xapian::PL2PlusWeight
,
138 TEST_WEIGHT_CLASS(Xapian::LMWeight
,
139 (0.0, Xapian::Weight::TWO_STAGE_SMOOTHING
, 0.7, 2000.0),
140 (0.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING
, 0.7));
143 /// Basic test of using weighting schemes.
144 DEFINE_TESTCASE(weight1
, backend
) {
145 Xapian::Database
db(get_database("etext"));
146 Xapian::Enquire
enquire(db
);
147 Xapian::Enquire
enquire_scaled(db
);
148 auto term
= "robinson";
149 Xapian::Query q
{term
};
150 enquire
.set_query(q
);
151 enquire_scaled
.set_query(q
* 15.0);
152 auto expected_matches
= db
.get_termfreq(term
);
153 auto helper
= [&](const Xapian::Weight
& weight
,
155 string_view params
) {
156 tout
<< name
<< '(' << params
<< ")\n";
157 enquire
.set_weighting_scheme(weight
);
158 enquire_scaled
.set_weighting_scheme(weight
);
159 Xapian::MSet mset
= enquire
.get_mset(0, expected_matches
+ 1);
160 TEST_EQUAL(mset
.size(), expected_matches
);
161 if (name
== "Xapian::BoolWeight") {
162 /* All weights should be zero. */
163 TEST_EQUAL(mset
[0].get_weight(), 0.0);
164 TEST_EQUAL(mset
.back().get_weight(), 0.0);
165 } else if (name
== "Xapian::CoordWeight") {
166 /* All weights should be 1 for a single term query. */
167 TEST_EQUAL(mset
[0].get_weight(), 1.0);
168 TEST_EQUAL(mset
.back().get_weight(), 1.0);
169 } else if (!params
.empty()) {
170 /* All weights should be equal with these particular parameters. */
171 TEST_NOT_EQUAL(mset
[0].get_weight(), 0.0);
172 TEST_EQUAL(mset
[0].get_weight(), mset
.back().get_weight());
174 TEST_NOT_EQUAL(mset
[0].get_weight(), 0.0);
175 TEST_NOT_EQUAL(mset
[0].get_weight(), mset
.back().get_weight());
177 Xapian::MSet mset_scaled
= enquire_scaled
.get_mset(0, expected_matches
);
178 TEST_EQUAL(mset_scaled
.size(), expected_matches
);
179 for (Xapian::doccount i
= 0; i
< expected_matches
; ++i
) {
180 TEST_EQUAL_DOUBLE(mset_scaled
[i
].get_weight(),
181 mset
[i
].get_weight() * 15.0);
184 #define TEST_WEIGHTING_SCHEME(W, ...) helper(W(__VA_ARGS__), #W, #__VA_ARGS__)
185 TEST_WEIGHTING_SCHEME(Xapian::BoolWeight
);
186 TEST_WEIGHTING_SCHEME(Xapian::CoordWeight
);
187 TEST_WEIGHTING_SCHEME(Xapian::DLHWeight
);
188 TEST_WEIGHTING_SCHEME(Xapian::DPHWeight
);
189 TEST_WEIGHTING_SCHEME(Xapian::DiceCoeffWeight
);
190 TEST_WEIGHTING_SCHEME(Xapian::TradWeight
);
191 TEST_WEIGHTING_SCHEME(Xapian::BM25Weight
);
192 TEST_WEIGHTING_SCHEME(Xapian::BM25PlusWeight
);
193 TEST_WEIGHTING_SCHEME(Xapian::TfIdfWeight
);
194 TEST_WEIGHTING_SCHEME(Xapian::InL2Weight
);
195 TEST_WEIGHTING_SCHEME(Xapian::IfB2Weight
);
196 TEST_WEIGHTING_SCHEME(Xapian::IneB2Weight
);
197 TEST_WEIGHTING_SCHEME(Xapian::BB2Weight
);
198 TEST_WEIGHTING_SCHEME(Xapian::PL2Weight
);
199 TEST_WEIGHTING_SCHEME(Xapian::PL2PlusWeight
);
200 TEST_WEIGHTING_SCHEME(Xapian::LMWeight
);
201 // Regression test for bug fixed in 1.2.4.
202 TEST_WEIGHTING_SCHEME(Xapian::BM25Weight
, 0, 0, 0, 0, 1);
203 /* As mentioned in the documentation, when parameter k is 0, wdf and
204 * document length don't affect the weights. Regression test for bug fixed
207 TEST_WEIGHTING_SCHEME(Xapian::TradWeight
, 0);
208 #undef TEST_WEIGHTING_SCHEME
211 /** Regression test for bug fixed in 1.0.5.
213 * This test would fail under valgrind because it used an uninitialised value.
215 DEFINE_TESTCASE(bm25weight1
, backend
) {
216 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
217 enquire
.set_weighting_scheme(Xapian::BM25Weight(1, 25, 1, 0.01, 0.5));
218 enquire
.set_query(Xapian::Query("word"));
220 Xapian::MSet mset
= enquire
.get_mset(0, 25);
223 // Test parameter combinations which should be unaffected by doclength.
224 DEFINE_TESTCASE(bm25weight4
, backend
) {
225 Xapian::Database db
= get_database("apitest_simpledata");
226 Xapian::Enquire
enquire(db
);
227 enquire
.set_query(Xapian::Query("paragraph"));
230 enquire
.set_weighting_scheme(Xapian::BM25Weight(1, 0, 1, 0, 0.5));
231 mset
= enquire
.get_mset(0, 10);
232 TEST_EQUAL(mset
.size(), 5);
233 // Expect: wdf has an effect on weight, but doclen doesn't.
234 TEST_REL(mset
[0].get_weight(),>,mset
[1].get_weight());
235 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), mset
[2].get_weight());
236 TEST_REL(mset
[2].get_weight(),>,mset
[3].get_weight());
237 TEST_EQUAL_DOUBLE(mset
[3].get_weight(), mset
[4].get_weight());
239 enquire
.set_weighting_scheme(Xapian::BM25Weight(0, 0, 1, 1, 0.5));
240 mset
= enquire
.get_mset(0, 10);
241 TEST_EQUAL(mset
.size(), 5);
242 // Expect: neither wdf nor doclen affects weight.
243 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), mset
[4].get_weight());
246 /// Test non-zero k2 with zero k1.
247 // Regression test for bug fixed in 1.2.17 and 1.3.2.
248 DEFINE_TESTCASE(bm25weight5
, backend
) {
249 Xapian::Database db
= get_database("apitest_simpledata");
250 Xapian::Enquire
enquire(db
);
251 enquire
.set_query(Xapian::Query("paragraph"));
254 enquire
.set_weighting_scheme(Xapian::BM25Weight(0, 1, 1, 0.5, 0.5));
255 mset
= enquire
.get_mset(0, 10);
256 TEST_EQUAL(mset
.size(), 5);
257 // Expect: wdf has no effect on weight; shorter docs rank higher.
258 mset_expect_order(mset
, 3, 5, 1, 4, 2);
259 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), mset
[1].get_weight());
260 TEST_REL(mset
[1].get_weight(),>,mset
[2].get_weight());
261 TEST_REL(mset
[2].get_weight(),>,mset
[3].get_weight());
262 TEST_REL(mset
[3].get_weight(),>,mset
[4].get_weight());
265 // Test parameter combinations which should be unaffected by doclength.
266 DEFINE_TESTCASE(bm25plusweight2
, backend
) {
267 Xapian::Database db
= get_database("apitest_simpledata");
268 Xapian::Enquire
enquire(db
);
269 enquire
.set_query(Xapian::Query("paragraph"));
272 enquire
.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0, 0.5, 1));
273 mset
= enquire
.get_mset(0, 10);
274 TEST_EQUAL(mset
.size(), 5);
275 // Expect: wdf has an effect on weight, but doclen doesn't.
276 TEST_REL(mset
[0].get_weight(),>,mset
[1].get_weight());
277 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), mset
[2].get_weight());
278 TEST_REL(mset
[2].get_weight(),>,mset
[3].get_weight());
279 TEST_EQUAL_DOUBLE(mset
[3].get_weight(), mset
[4].get_weight());
281 enquire
.set_weighting_scheme(Xapian::BM25PlusWeight(0, 0, 1, 1, 0.5, 1));
282 mset
= enquire
.get_mset(0, 10);
283 TEST_EQUAL(mset
.size(), 5);
284 // Expect: neither wdf nor doclen affects weight.
285 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), mset
[4].get_weight());
288 // Regression test for a mistake corrected in the BM25+ implementation.
289 DEFINE_TESTCASE(bm25plusweight3
, backend
) {
290 Xapian::Database db
= get_database("apitest_simpledata");
291 Xapian::Enquire
enquire(db
);
292 enquire
.set_query(Xapian::Query("paragraph"));
295 enquire
.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0.5, 0.5, 1));
296 mset
= enquire
.get_mset(0, 10);
297 TEST_EQUAL(mset
.size(), 5);
299 // The value of each doc weight calculated manually from the BM25+ formulae
300 // by using the respective document statistics.
301 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 0.7920796567487473);
302 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 0.7846980783848447);
303 TEST_EQUAL_DOUBLE(mset
[2].get_weight(), 0.7558817623365934);
304 TEST_EQUAL_DOUBLE(mset
[3].get_weight(), 0.7210119356168847);
305 TEST_EQUAL_DOUBLE(mset
[4].get_weight(), 0.7210119356168847);
309 // Test for invalid values of c.
310 DEFINE_TESTCASE(inl2weight2
, !backend
) {
311 // InvalidArgumentError should be thrown if the parameter c is invalid.
312 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
313 Xapian::InL2Weight
wt(-2.0));
315 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
316 Xapian::InL2Weight
wt2(0.0));
319 // Feature tests for Inl2Weight
320 DEFINE_TESTCASE(inl2weight3
, backend
) {
321 Xapian::Database db
= get_database("apitest_simpledata");
322 Xapian::Enquire
enquire(db
);
323 Xapian::Query
query("banana");
325 enquire
.set_query(query
);
326 enquire
.set_weighting_scheme(Xapian::InL2Weight(2.0));
329 mset1
= enquire
.get_mset(0, 10);
330 TEST_EQUAL(mset1
.size(), 1);
331 mset_expect_order(mset1
, 6);
333 /* The value has been calculated in the python interpreter by looking at the
334 * database statistics. */
335 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 1.559711143842063);
338 // Test for invalid values of c.
339 DEFINE_TESTCASE(ifb2weight2
, !backend
) {
340 // InvalidArgumentError should be thrown if the parameter c is invalid.
341 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
342 Xapian::IfB2Weight
wt(-2.0));
344 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
345 Xapian::IfB2Weight
wt2(0.0));
349 DEFINE_TESTCASE(ifb2weight3
, backend
) {
350 Xapian::Database db
= get_database("apitest_simpledata");
351 Xapian::Enquire
enquire(db
);
352 Xapian::Query
query("banana");
354 enquire
.set_query(query
);
355 enquire
.set_weighting_scheme(Xapian::IfB2Weight(2.0));
358 mset1
= enquire
.get_mset(0, 10);
359 TEST_EQUAL(mset1
.size(), 1);
361 /* The value of the weight has been manually calculated using the statistics
362 * of the test database. */
363 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 3.119422287684126);
366 // Test for invalid values of c.
367 DEFINE_TESTCASE(ineb2weight2
, !backend
) {
368 // InvalidArgumentError should be thrown if parameter c is invalid.
369 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
370 Xapian::IneB2Weight
wt(-2.0));
372 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
373 Xapian::IneB2Weight
wt2(0.0));
377 DEFINE_TESTCASE(ineb2weight3
, backend
) {
378 Xapian::Database db
= get_database("apitest_simpledata");
379 Xapian::Enquire
enquire(db
);
380 Xapian::Query
query("paragraph");
381 enquire
.set_query(query
);
382 enquire
.set_weighting_scheme(Xapian::IneB2Weight(2.0));
385 mset1
= enquire
.get_mset(0, 10);
386 TEST_EQUAL(mset1
.size(), 5);
388 // The third document in the database is 4th in the ranking.
389 /* The weight value has been manually calculated by using the statistics
390 * of the test database. */
391 TEST_EQUAL_DOUBLE(mset1
[4].get_weight(), 0.61709730297692400036);
394 // Test for invalid values of c.
395 DEFINE_TESTCASE(bb2weight2
, !backend
) {
396 // InvalidArgumentError should be thrown if the parameter c is invalid.
397 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
398 Xapian::BB2Weight
wt(-2.0));
400 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
401 Xapian::BB2Weight
wt2(0.0));
405 DEFINE_TESTCASE(bb2weight3
, backend
) {
406 Xapian::Database db
= get_database("apitest_simpledata");
407 Xapian::Enquire
enquire(db
);
408 Xapian::Query
query("paragraph");
410 enquire
.set_query(query
);
411 enquire
.set_weighting_scheme(Xapian::BB2Weight(2.0));
414 mset1
= enquire
.get_mset(0, 10);
415 TEST_EQUAL(mset1
.size(), 5);
416 /* The third document in the database has the highest weight and is the
417 * first in the mset. */
418 // Value calculated manually by using the statistics of the test database.
419 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 1.6823696969784483);
421 // Test with OP_SCALE_WEIGHT and a small factor (regression test, as we
422 // were applying the factor to the upper bound twice).
423 enquire
.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT
, query
, 1.0 / 1024));
424 enquire
.set_weighting_scheme(Xapian::BB2Weight(2.0));
427 mset3
= enquire
.get_mset(0, 10);
428 TEST_EQUAL(mset3
.size(), 5);
430 for (int i
= 0; i
< 5; ++i
) {
431 TEST_EQUAL_DOUBLE(mset1
[i
].get_weight(), mset3
[i
].get_weight() * 1024);
435 // Regression test: we used to calculate log2(0) when there was only one doc.
436 DEFINE_TESTCASE(bb2weight4
, backend
) {
437 Xapian::Database db
= get_database("apitest_onedoc");
438 Xapian::Enquire
enquire(db
);
439 Xapian::Query
query("word");
441 enquire
.set_query(query
);
442 enquire
.set_weighting_scheme(Xapian::BB2Weight());
445 mset1
= enquire
.get_mset(0, 10);
446 TEST_EQUAL(mset1
.size(), 1);
447 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 3.431020621347435);
451 DEFINE_TESTCASE(dlhweight1
, backend
) {
452 Xapian::Database db
= get_database("apitest_simpledata");
453 Xapian::Enquire
enquire(db
);
454 Xapian::Query
query("a");
456 enquire
.set_query(query
);
457 enquire
.set_weighting_scheme(Xapian::DLHWeight());
460 mset1
= enquire
.get_mset(0, 10);
461 TEST_EQUAL(mset1
.size(), 3);
462 mset_expect_order(mset1
, 3, 1, 2);
463 // Weights calculated manually using stats from the database.
464 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 1.0046477754371292362);
465 TEST_EQUAL_DOUBLE(mset1
[1].get_weight(), 0.97621929514640352757);
466 // The following weight would be negative but gets clamped to 0.
467 TEST_EQUAL_DOUBLE(mset1
[2].get_weight(), 0.0);
471 gen_wdf_eq_doclen_db(Xapian::WritableDatabase
& db
, const string
&)
473 Xapian::Document doc
;
474 doc
.add_term("solo", 37);
475 db
.add_document(doc
);
478 // Test wdf == doclen.
479 DEFINE_TESTCASE(dlhweight3
, backend
) {
480 Xapian::Database db
= get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db
);
481 Xapian::Enquire
enquire(db
);
482 Xapian::Query
query("solo");
484 enquire
.set_query(query
);
485 enquire
.set_weighting_scheme(Xapian::DLHWeight());
488 mset1
= enquire
.get_mset(0, 10);
489 TEST_EQUAL(mset1
.size(), 1);
490 // Weight gets clamped to zero.
491 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 0.0);
494 // Test for invalid values of c.
495 DEFINE_TESTCASE(pl2weight2
, !backend
) {
496 // InvalidArgumentError should be thrown if parameter c is invalid.
497 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
498 Xapian::PL2Weight
wt(-2.0));
502 DEFINE_TESTCASE(pl2weight3
, backend
) {
503 Xapian::Database db
= get_database("apitest_simpledata");
504 Xapian::Enquire
enquire(db
);
505 Xapian::Query
query("paragraph");
506 enquire
.set_query(query
);
509 enquire
.set_weighting_scheme(Xapian::PL2Weight(2.0));
510 mset
= enquire
.get_mset(0, 10);
511 TEST_EQUAL(mset
.size(), 5);
512 // Expected weight difference calculated in extended precision using stats
513 // from the test database.
514 TEST_EQUAL_DOUBLE(mset
[2].get_weight(),
515 mset
[3].get_weight() + 0.0086861771701328694);
518 // Test for invalid values of parameters, c and delta.
519 DEFINE_TESTCASE(pl2plusweight2
, !backend
) {
520 // InvalidArgumentError should be thrown if parameter c is invalid.
521 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
522 Xapian::PL2PlusWeight
wt(-2.0, 0.9));
524 // InvalidArgumentError should be thrown if parameter delta is invalid.
525 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
526 Xapian::PL2PlusWeight
wt(1.0, -1.9));
529 // Feature Test 1 for PL2PlusWeight.
530 DEFINE_TESTCASE(pl2plusweight4
, backend
) {
531 Xapian::Database db
= get_database("apitest_simpledata");
532 Xapian::Enquire
enquire(db
);
533 enquire
.set_query(Xapian::Query("to"));
536 enquire
.set_weighting_scheme(Xapian::PL2PlusWeight(2.0, 0.8));
537 mset
= enquire
.get_mset(0, 10);
538 TEST_EQUAL(mset
.size(), 3);
539 // Expected weight difference calculated in Python using stats from the
541 TEST_EQUAL_DOUBLE(mset
[1].get_weight(),
542 mset
[2].get_weight() + 0.016760925252262027);
545 // Feature Test 2 for PL2PlusWeight
546 DEFINE_TESTCASE(pl2plusweight5
, backend
) {
547 Xapian::Database db
= get_database("apitest_simpledata");
548 Xapian::Enquire
enquire(db
);
549 Xapian::Query
query("word");
550 enquire
.set_query(query
);
553 enquire
.set_weighting_scheme(Xapian::PL2PlusWeight(1.0, 0.8));
554 mset
= enquire
.get_mset(0, 10);
555 // Expect MSet contains two documents having query "word".
556 TEST_EQUAL(mset
.size(), 2);
557 // Expect Document 2 has higher weight than document 4 because
558 // "word" appears more no. of times in document 2 than document 4.
559 mset_expect_order(mset
, 2, 4);
563 DEFINE_TESTCASE(dphweight1
, backend
) {
564 Xapian::Database db
= get_database("apitest_simpledata");
565 Xapian::Enquire
enquire(db
);
566 Xapian::Query
query("paragraph");
568 enquire
.set_query(query
);
569 enquire
.set_weighting_scheme(Xapian::DPHWeight());
572 mset1
= enquire
.get_mset(0, 10);
573 TEST_EQUAL(mset1
.size(), 5);
574 /* The weight has been calculated manually by using the statistics of the
576 TEST_EQUAL_DOUBLE(mset1
[2].get_weight() - mset1
[4].get_weight(), 0.542623617687990167);
579 // Test wdf == doclen.
580 DEFINE_TESTCASE(dphweight3
, backend
) {
581 Xapian::Database db
= get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db
);
582 Xapian::Enquire
enquire(db
);
583 Xapian::Query
query("solo");
585 enquire
.set_query(query
);
586 enquire
.set_weighting_scheme(Xapian::DPHWeight());
589 mset1
= enquire
.get_mset(0, 10);
590 TEST_EQUAL(mset1
.size(), 1);
591 // Weight gets clamped to zero.
592 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 0.0);
595 // Test for various cases of normalization string.
596 DEFINE_TESTCASE(tfidfweight1
, !backend
) {
597 // InvalidArgumentError should be thrown if normalization string is invalid
598 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
599 Xapian::TfIdfWeight
b("JOHN_LENNON"));
601 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
602 Xapian::TfIdfWeight
b("LOL"));
604 /* Normalization string should be set to "ntn" by constructor if none is
606 Xapian::TfIdfWeight weight2
;
607 TEST_EQUAL(weight2
.serialise(), Xapian::TfIdfWeight("ntn").serialise());
609 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
610 Xapian::Weight::create("tfidf FUN NONE NONE"));
612 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
613 Xapian::Weight::create("tfidf NONE FUN NONE"));
615 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
616 Xapian::Weight::create("tfidf NONE NONE FUN"));
618 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
619 Xapian::Weight::create("tfidf NONE"));
621 TEST_EXCEPTION(Xapian::InvalidArgumentError
,
622 Xapian::Weight::create("tfidf NONE NONE"));
625 // Feature tests for various normalization functions.
626 DEFINE_TESTCASE(tfidfweight3
, backend
) {
627 Xapian::Database db
= get_database("apitest_simpledata");
628 Xapian::Enquire
enquire(db
);
629 Xapian::Query
query("word");
632 // Check for "ntn" when termfreq != N
633 enquire
.set_query(query
);
634 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
635 mset
= enquire
.get_mset(0, 10);
636 TEST_EQUAL(mset
.size(), 2);
637 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
638 mset_expect_order(mset
, 2, 4);
639 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 * log(6.0 / 2));
641 // Check that wqf is taken into account.
642 enquire
.set_query(Xapian::Query("word", 2));
643 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
644 Xapian::MSet mset2
= enquire
.get_mset(0, 10);
645 TEST_EQUAL(mset2
.size(), 2);
646 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
647 mset_expect_order(mset2
, 2, 4);
648 // wqf is 2, so weights should be doubled.
649 TEST_EQUAL_DOUBLE(mset
[0].get_weight() * 2, mset2
[0].get_weight());
650 TEST_EQUAL_DOUBLE(mset
[1].get_weight() * 2, mset2
[1].get_weight());
652 // check for "nfn" when termfreq != N
653 enquire
.set_query(query
);
654 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("nfn"));
655 mset
= enquire
.get_mset(0, 10);
656 TEST_EQUAL(mset
.size(), 2);
657 mset_expect_order(mset
, 2, 4);
658 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 / 2);
660 // check for "nsn" when termfreq != N
661 enquire
.set_query(query
);
662 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("nsn"));
663 mset
= enquire
.get_mset(0, 10);
664 TEST_EQUAL(mset
.size(), 2);
665 mset_expect_order(mset
, 2, 4);
666 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
668 // Check for "bnn" and for both branches of 'b'.
669 enquire
.set_query(Xapian::Query("test"));
670 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("bnn"));
671 mset
= enquire
.get_mset(0, 10);
672 TEST_EQUAL(mset
.size(), 1);
673 mset_expect_order(mset
, 1);
674 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 1.0);
676 // Check for "lnn" and for both branches of 'l'.
677 enquire
.set_query(Xapian::Query("word"));
678 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("lnn"));
679 mset
= enquire
.get_mset(0, 10);
680 TEST_EQUAL(mset
.size(), 2);
681 mset_expect_order(mset
, 2, 4);
682 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 1 + log(8.0)); // idfn=1 and so wt=tfn=1+log(tf)
683 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1.0); // idfn=1 and wt=tfn=1+log(tf)=1+log(1)=1
686 enquire
.set_query(Xapian::Query("paragraph"));
687 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("snn")); // idf=1 and tfn=tf*tf
688 mset
= enquire
.get_mset(0, 10);
689 TEST_EQUAL(mset
.size(), 5);
690 mset_expect_order(mset
, 2, 1, 4, 3, 5);
691 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 9.0);
692 TEST_EQUAL_DOUBLE(mset
[4].get_weight(), 1.0);
694 // Check for "ntn" when termfreq=N
695 enquire
.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "t"
696 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
697 mset
= enquire
.get_mset(0, 10);
698 TEST_EQUAL(mset
.size(), 6);
699 mset_expect_order(mset
, 1, 2, 3, 4, 5, 6);
700 for (int i
= 0; i
< 6; ++i
) {
701 TEST_EQUAL_DOUBLE(mset
[i
].get_weight(), 0.0);
704 // Check for "npn" and for both branches of 'p'
705 enquire
.set_query(Xapian::Query("this")); // N=termfreq and so idfn=0 for "p"
706 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
707 mset
= enquire
.get_mset(0, 10);
708 TEST_EQUAL(mset
.size(), 6);
709 mset_expect_order(mset
, 1, 2, 3, 4, 5, 6);
710 for (int i
= 0; i
< 6; ++i
) {
711 TEST_EQUAL_DOUBLE(mset
[i
].get_weight(), 0.0);
715 enquire
.set_query(Xapian::Query("word"));
716 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("Lnn"));
717 mset
= enquire
.get_mset(0, 10);
718 TEST_EQUAL(mset
.size(), 2);
719 mset_expect_order(mset
, 2, 4);
720 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
721 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
723 enquire
.set_query(Xapian::Query("word"));
724 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
725 mset
= enquire
.get_mset(0, 10);
726 TEST_EQUAL(mset
.size(), 2);
727 mset_expect_order(mset
, 2, 4);
728 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8 * log((6.0 - 2) / 2));
729 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1 * log((6.0 - 2) / 2));
732 enquire
.set_query(Xapian::Query("word"));
733 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("mnn"));
734 mset
= enquire
.get_mset(0, 10);
735 TEST_EQUAL(mset
.size(), 2);
736 mset_expect_order(mset
, 2, 4);
737 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 / 8);
738 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1.0 / 4);
741 enquire
.set_query(Xapian::Query("word"));
742 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("ann"));
743 mset
= enquire
.get_mset(0, 10);
744 TEST_EQUAL(mset
.size(), 2);
745 mset_expect_order(mset
, 2, 4);
746 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
747 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
749 // Check for NONE, TFIDF, NONE when termfreq != N
750 enquire
.set_query(query
);
751 enquire
.set_weighting_scheme(
753 Xapian::TfIdfWeight::wdf_norm::NONE
,
754 Xapian::TfIdfWeight::idf_norm::TFIDF
,
755 Xapian::TfIdfWeight::wt_norm::NONE
));
756 mset
= enquire
.get_mset(0, 10);
757 TEST_EQUAL(mset
.size(), 2);
758 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
759 mset_expect_order(mset
, 2, 4);
760 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 * log(6.0 / 2));
762 // Check that wqf is taken into account.
763 enquire
.set_query(Xapian::Query("word", 2));
764 mset2
= enquire
.get_mset(0, 10);
765 TEST_EQUAL(mset2
.size(), 2);
766 // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
767 mset_expect_order(mset2
, 2, 4);
768 // wqf is 2, so weights should be doubled.
769 TEST_EQUAL_DOUBLE(mset
[0].get_weight() * 2, mset2
[0].get_weight());
770 TEST_EQUAL_DOUBLE(mset
[1].get_weight() * 2, mset2
[1].get_weight());
772 // check for NONE, FREQ, NONE when termfreq != N
773 enquire
.set_query(query
);
774 enquire
.set_weighting_scheme(
776 Xapian::TfIdfWeight::wdf_norm::NONE
,
777 Xapian::TfIdfWeight::idf_norm::FREQ
,
778 Xapian::TfIdfWeight::wt_norm::NONE
));
779 mset
= enquire
.get_mset(0, 10);
780 TEST_EQUAL(mset
.size(), 2);
781 mset_expect_order(mset
, 2, 4);
782 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 / 2);
784 // check for NONE, SQUARE, NONE when termfreq != N
785 enquire
.set_query(query
);
786 enquire
.set_weighting_scheme(
788 Xapian::TfIdfWeight::wdf_norm::NONE
,
789 Xapian::TfIdfWeight::idf_norm::SQUARE
,
790 Xapian::TfIdfWeight::wt_norm::NONE
));
791 mset
= enquire
.get_mset(0, 10);
792 TEST_EQUAL(mset
.size(), 2);
793 mset_expect_order(mset
, 2, 4);
794 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
796 // Check for BOOLEAN, NONE, NONE and for both branches of BOOLEAN.
797 enquire
.set_query(Xapian::Query("test"));
798 enquire
.set_weighting_scheme(
800 Xapian::TfIdfWeight::wdf_norm::BOOLEAN
,
801 Xapian::TfIdfWeight::idf_norm::NONE
,
802 Xapian::TfIdfWeight::wt_norm::NONE
));
803 mset
= enquire
.get_mset(0, 10);
804 TEST_EQUAL(mset
.size(), 1);
805 mset_expect_order(mset
, 1);
806 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 1.0);
808 // Check for LOG, NONE, NONE and for both branches of LOG.
809 enquire
.set_query(Xapian::Query("word"));
810 enquire
.set_weighting_scheme(
812 Xapian::TfIdfWeight::wdf_norm::LOG
,
813 Xapian::TfIdfWeight::idf_norm::NONE
,
814 Xapian::TfIdfWeight::wt_norm::NONE
));
815 mset
= enquire
.get_mset(0, 10);
816 TEST_EQUAL(mset
.size(), 2);
817 mset_expect_order(mset
, 2, 4);
818 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 1 + log(8.0));
819 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1.0);
821 // Check for SQUARE, NONE, NONE.
822 enquire
.set_query(Xapian::Query("paragraph"));
823 enquire
.set_weighting_scheme(
825 Xapian::TfIdfWeight::wdf_norm::SQUARE
,
826 Xapian::TfIdfWeight::idf_norm::NONE
,
827 Xapian::TfIdfWeight::wt_norm::NONE
)); // idf=1 and tfn=tf*tf
828 mset
= enquire
.get_mset(0, 10);
829 TEST_EQUAL(mset
.size(), 5);
830 mset_expect_order(mset
, 2, 1, 4, 3, 5);
831 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 9.0);
832 TEST_EQUAL_DOUBLE(mset
[4].get_weight(), 1.0);
834 // Check for NONE, TFIDF, NONE when termfreq=N
835 enquire
.set_query(Xapian::Query("this"));
836 // N=termfreq and so idfn=0 for TFIDF
837 enquire
.set_weighting_scheme(
839 Xapian::TfIdfWeight::wdf_norm::NONE
,
840 Xapian::TfIdfWeight::idf_norm::TFIDF
,
841 Xapian::TfIdfWeight::wt_norm::NONE
));
842 mset
= enquire
.get_mset(0, 10);
843 TEST_EQUAL(mset
.size(), 6);
844 mset_expect_order(mset
, 1, 2, 3, 4, 5, 6);
845 for (int i
= 0; i
< 6; ++i
) {
846 TEST_EQUAL_DOUBLE(mset
[i
].get_weight(), 0.0);
849 // Check for NONE, PROB, NONE and for both branches of PROB
850 enquire
.set_query(Xapian::Query("this"));
851 // N=termfreq and so idfn=0 for PROB
852 enquire
.set_weighting_scheme(
854 Xapian::TfIdfWeight::wdf_norm::NONE
,
855 Xapian::TfIdfWeight::idf_norm::PROB
,
856 Xapian::TfIdfWeight::wt_norm::NONE
));
857 mset
= enquire
.get_mset(0, 10);
858 TEST_EQUAL(mset
.size(), 6);
859 mset_expect_order(mset
, 1, 2, 3, 4, 5, 6);
860 for (int i
= 0; i
< 6; ++i
) {
861 TEST_EQUAL_DOUBLE(mset
[i
].get_weight(), 0.0);
864 enquire
.set_query(Xapian::Query("word"));
865 enquire
.set_weighting_scheme(
867 Xapian::TfIdfWeight::wdf_norm::NONE
,
868 Xapian::TfIdfWeight::idf_norm::PROB
,
869 Xapian::TfIdfWeight::wt_norm::NONE
));
870 mset
= enquire
.get_mset(0, 10);
871 TEST_EQUAL(mset
.size(), 2);
872 mset_expect_order(mset
, 2, 4);
873 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8 * log((6.0 - 2) / 2));
874 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1 * log((6.0 - 2) / 2));
876 // Check for LOG_AVERAGE, NONE, NONE.
877 enquire
.set_query(Xapian::Query("word"));
878 enquire
.set_weighting_scheme(
880 Xapian::TfIdfWeight::wdf_norm::LOG_AVERAGE
,
881 Xapian::TfIdfWeight::idf_norm::NONE
,
882 Xapian::TfIdfWeight::wt_norm::NONE
));
883 mset
= enquire
.get_mset(0, 10);
884 TEST_EQUAL(mset
.size(), 2);
885 mset_expect_order(mset
, 2, 4);
886 TEST_EQUAL_DOUBLE(mset
[0].get_weight(),
887 (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
888 TEST_EQUAL_DOUBLE(mset
[1].get_weight(),
889 (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
891 // Check for AUG_LOG, NONE, NONE.
892 enquire
.set_weighting_scheme(
894 Xapian::TfIdfWeight::wdf_norm::AUG_LOG
,
895 Xapian::TfIdfWeight::idf_norm::NONE
,
896 Xapian::TfIdfWeight::wt_norm::NONE
));
897 mset
= enquire
.get_mset(0, 10);
898 TEST_EQUAL(mset
.size(), 2);
899 mset_expect_order(mset
, 2, 4);
900 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 0.2 + 0.8 * log(1.0 + 8));
901 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 0.2 + 0.8 * log(1.0 + 1));
903 // Check for NONE, GLOBAL_FREQ, NONE.
904 enquire
.set_weighting_scheme(
906 Xapian::TfIdfWeight::wdf_norm::NONE
,
907 Xapian::TfIdfWeight::idf_norm::GLOBAL_FREQ
,
908 Xapian::TfIdfWeight::wt_norm::NONE
));
909 mset
= enquire
.get_mset(0, 10);
910 TEST_EQUAL(mset
.size(), 2);
911 mset_expect_order(mset
, 2, 4);
912 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8 * (9.0 / 2));
913 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1 * (9.0 / 2));
915 // Check for SQRT, NONE, NONE.
916 enquire
.set_weighting_scheme(
918 Xapian::TfIdfWeight::wdf_norm::SQRT
,
919 Xapian::TfIdfWeight::idf_norm::NONE
,
920 Xapian::TfIdfWeight::wt_norm::NONE
));
921 mset
= enquire
.get_mset(0, 10);
922 TEST_EQUAL(mset
.size(), 2);
923 mset_expect_order(mset
, 2, 4);
924 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), sqrt(8 - 0.5) + 1);
925 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), sqrt(1 - 0.5) + 1);
927 // Check for NONE, LOG_GLOBAL_FREQ, NONE.
928 enquire
.set_weighting_scheme(
930 Xapian::TfIdfWeight::wdf_norm::NONE
,
931 Xapian::TfIdfWeight::idf_norm::LOG_GLOBAL_FREQ
,
932 Xapian::TfIdfWeight::wt_norm::NONE
));
933 mset
= enquire
.get_mset(0, 10);
934 TEST_EQUAL(mset
.size(), 2);
935 mset_expect_order(mset
, 2, 4);
936 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8 * log(9.0 / 2 + 1));
937 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1 * log(9.0 / 2 + 1));
939 // Check for NONE, INCREMENTED_GLOBAL_FREQ, NONE.
940 enquire
.set_weighting_scheme(
942 Xapian::TfIdfWeight::wdf_norm::NONE
,
943 Xapian::TfIdfWeight::idf_norm::INCREMENTED_GLOBAL_FREQ
,
944 Xapian::TfIdfWeight::wt_norm::NONE
));
945 mset
= enquire
.get_mset(0, 10);
946 TEST_EQUAL(mset
.size(), 2);
947 mset_expect_order(mset
, 2, 4);
948 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8 * (9.0 / 2 + 1));
949 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1 * (9.0 / 2 + 1));
951 // Check for NONE, SQRT_GLOBAL_FREQ, NONE.
952 enquire
.set_weighting_scheme(
954 Xapian::TfIdfWeight::wdf_norm::NONE
,
955 Xapian::TfIdfWeight::idf_norm::SQRT_GLOBAL_FREQ
,
956 Xapian::TfIdfWeight::wt_norm::NONE
));
957 mset
= enquire
.get_mset(0, 10);
958 TEST_EQUAL(mset
.size(), 2);
959 mset_expect_order(mset
, 2, 4);
960 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8 * sqrt(9.0 / 2 - 0.9));
961 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1 * sqrt(9.0 / 2 - 0.9));
963 // Check for AUG_AVERAGE, NONE, NONE.
964 enquire
.set_weighting_scheme(
966 Xapian::TfIdfWeight::wdf_norm::AUG_AVERAGE
,
967 Xapian::TfIdfWeight::idf_norm::NONE
,
968 Xapian::TfIdfWeight::wt_norm::NONE
));
969 mset
= enquire
.get_mset(0, 10);
970 TEST_EQUAL(mset
.size(), 2);
971 mset_expect_order(mset
, 2, 4);
972 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 0.9 + 0.1 * (8.0 / (81.0 / 56.0)));
973 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 0.9 + 0.1 * (1.0 / (31.0 / 26.0)));
975 // Check for MAX, NONE, NONE.
976 enquire
.set_weighting_scheme(
978 Xapian::TfIdfWeight::wdf_norm::MAX
,
979 Xapian::TfIdfWeight::idf_norm::NONE
,
980 Xapian::TfIdfWeight::wt_norm::NONE
));
981 mset
= enquire
.get_mset(0, 10);
982 TEST_EQUAL(mset
.size(), 2);
983 mset_expect_order(mset
, 2, 4);
984 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 8.0 / 8);
985 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 1.0 / 4);
987 // Check for AUG, NONE, NONE.
988 enquire
.set_weighting_scheme(
990 Xapian::TfIdfWeight::wdf_norm::AUG
,
991 Xapian::TfIdfWeight::idf_norm::NONE
,
992 Xapian::TfIdfWeight::wt_norm::NONE
));
993 mset
= enquire
.get_mset(0, 10);
994 TEST_EQUAL(mset
.size(), 2);
995 mset_expect_order(mset
, 2, 4);
996 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
997 TEST_EQUAL_DOUBLE(mset
[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
1000 // Feature tests for pivoted normalization functions.
1001 DEFINE_TESTCASE(tfidfweight4
, backend
) {
1002 Xapian::Database db
= get_database("apitest_simpledata");
1003 Xapian::Enquire
enquire(db
);
1004 Xapian::Query
query("paragraph");
1007 // Check for "PPn" normalization string.
1008 enquire
.set_query(query
);
1009 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1010 mset
= enquire
.get_mset(0, 10);
1011 TEST_EQUAL(mset
.size(), 5);
1012 // Shorter docs should ranker higher if wqf is equal among all the docs.
1013 TEST_REL(mset
[0].get_weight(),>,mset
[1].get_weight());
1014 TEST_REL(mset
[2].get_weight(),>,mset
[3].get_weight());
1016 // Check that wqf is taken into account.
1017 enquire
.set_query(Xapian::Query("paragraph", 2));
1018 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1019 Xapian::MSet mset2
= enquire
.get_mset(0, 10);
1020 TEST_EQUAL(mset2
.size(), 5);
1021 // wqf is 2, so weights should be doubled.
1022 TEST_EQUAL_DOUBLE(mset
[0].get_weight() * 2, mset2
[0].get_weight());
1023 TEST_EQUAL_DOUBLE(mset
[1].get_weight() * 2, mset2
[1].get_weight());
1025 // check for "nPn" which represents "xPx"
1026 enquire
.set_query(Xapian::Query("word"));
1027 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("nPn", 0.2, 1.0));
1028 mset
= enquire
.get_mset(0, 10);
1029 TEST_EQUAL(mset
.size(), 2);
1030 // Expect doc 2 with query "word" to have higher weight than doc 4.
1031 mset_expect_order(mset
, 2, 4);
1033 // check for "Ptn" which represents "Pxx"
1034 enquire
.set_query(Xapian::Query("word"));
1035 enquire
.set_weighting_scheme(Xapian::TfIdfWeight("Ptn", 0.2, 1.0));
1036 mset
= enquire
.get_mset(0, 10);
1037 TEST_EQUAL(mset
.size(), 2);
1038 // Expect doc 2 with query "word" to have higher weight than doc 4.
1039 mset_expect_order(mset
, 2, 4);
1041 // Check for PIVOTED, PIVOTED, NONE normalization string.
1042 enquire
.set_query(query
);
1043 enquire
.set_weighting_scheme(
1044 Xapian::TfIdfWeight(
1045 Xapian::TfIdfWeight::wdf_norm::PIVOTED
,
1046 Xapian::TfIdfWeight::idf_norm::PIVOTED
,
1047 Xapian::TfIdfWeight::wt_norm::NONE
));
1048 mset
= enquire
.get_mset(0, 10);
1049 TEST_EQUAL(mset
.size(), 5);
1050 // Shorter docs should ranker higher if wqf is equal among all the docs.
1051 TEST_REL(mset
[0].get_weight(),>,mset
[1].get_weight());
1052 TEST_REL(mset
[2].get_weight(),>,mset
[3].get_weight());
1054 // Check that wqf is taken into account.
1055 enquire
.set_query(Xapian::Query("paragraph", 2));
1056 mset2
= enquire
.get_mset(0, 10);
1057 TEST_EQUAL(mset2
.size(), 5);
1058 // wqf is 2, so weights should be doubled.
1059 TEST_EQUAL_DOUBLE(mset
[0].get_weight() * 2, mset2
[0].get_weight());
1060 TEST_EQUAL_DOUBLE(mset
[1].get_weight() * 2, mset2
[1].get_weight());
1062 // check for NONE, PIVOTED, NONE
1063 enquire
.set_query(Xapian::Query("word"));
1064 enquire
.set_weighting_scheme(
1065 Xapian::TfIdfWeight(
1066 Xapian::TfIdfWeight::wdf_norm::NONE
,
1067 Xapian::TfIdfWeight::idf_norm::PIVOTED
,
1068 Xapian::TfIdfWeight::wt_norm::NONE
));
1069 mset
= enquire
.get_mset(0, 10);
1070 TEST_EQUAL(mset
.size(), 2);
1071 // Expect doc 2 with query "word" to have higher weight than doc 4.
1072 mset_expect_order(mset
, 2, 4);
1074 // check for PIVOTED, TFIDF, NONE
1075 enquire
.set_query(Xapian::Query("word"));
1076 enquire
.set_weighting_scheme(
1077 Xapian::TfIdfWeight(
1078 Xapian::TfIdfWeight::wdf_norm::PIVOTED
,
1079 Xapian::TfIdfWeight::idf_norm::TFIDF
,
1080 Xapian::TfIdfWeight::wt_norm::NONE
));
1081 mset
= enquire
.get_mset(0, 10);
1082 TEST_EQUAL(mset
.size(), 2);
1083 // Expect doc 2 with query "word" to have higher weight than doc 4.
1084 mset_expect_order(mset
, 2, 4);
1087 // Check that create_from_parameters() creates the correct object.
1088 DEFINE_TESTCASE(tfidfweight5
, !backend
) {
1089 auto wt_ptr
= Xapian::Weight::create("tfidf NONE TFIDF NONE");
1090 auto wt
= Xapian::TfIdfWeight(Xapian::TfIdfWeight::wdf_norm::NONE
,
1091 Xapian::TfIdfWeight::idf_norm::TFIDF
,
1092 Xapian::TfIdfWeight::wt_norm::NONE
);
1093 TEST_EQUAL(wt_ptr
->serialise(), wt
.serialise());
1096 auto wt_ptr2
= Xapian::Weight::create("tfidf SQRT PIVOTED NONE");
1097 auto wt2
= Xapian::TfIdfWeight(Xapian::TfIdfWeight::wdf_norm::SQRT
,
1098 Xapian::TfIdfWeight::idf_norm::PIVOTED
,
1099 Xapian::TfIdfWeight::wt_norm::NONE
);
1100 TEST_EQUAL(wt_ptr2
->serialise(), wt2
.serialise());
// Weight subclass used by checkinitweight1 to observe init() calls.  It holds
// references to two caller-owned counters, zero_inits and non_zero_inits.
// NOTE(review): presumably init() bumps one counter or the other depending on
// whether the factor it receives is zero - confirm against the full source.
1104 class CheckInitWeight
: public Xapian::Weight
{
// Counters owned by the caller; clone() hands the same references to every
// copy, so all clones update the same totals.
1108 unsigned & zero_inits
, & non_zero_inits
;
// Bind the counters, start factor at -1.0 and request the DOC_LENGTH stat
// (get_sumextra() below divides by the document length).
1110 CheckInitWeight(unsigned &z
, unsigned &n
)
1111 : factor(-1.0), zero_inits(z
), non_zero_inits(n
) {
1112 need_stat(DOC_LENGTH
);
1115 void init(double factor_
) override
{
// Create a new object sharing this object's counter references.
1123 Weight
* clone() const override
{
1124 return new CheckInitWeight(zero_inits
, non_zero_inits
);
// All four statistics arguments are ignored (the parameters are unnamed).
1127 double get_sumpart(Xapian::termcount
, Xapian::termcount
,
1128 Xapian::termcount
, Xapian::termcount
) const override
{
1132 double get_maxpart() const override
{ return 1.0; }
// Term-independent contribution: the reciprocal of the document length.
1134 double get_sumextra(Xapian::termcount doclen
,
1136 Xapian::termcount
) const override
{
1137 return 1.0 / doclen
;
1140 double get_maxextra() const override
{ return 1.0; }
1143 /// Regression test - check init() is called for the term-indep Weight obj.
1144 DEFINE_TESTCASE(checkinitweight1
, backend
&& !multi
&& !remote
) {
1145 Xapian::Database db
= get_database("apitest_simpledata");
1146 Xapian::Enquire
enquire(db
);
1147 Xapian::Query
q(Xapian::Query::OP_AND
,
1148 Xapian::Query("this"), Xapian::Query("paragraph"));
1149 enquire
.set_query(q
);
1150 unsigned zero_inits
= 0, non_zero_inits
= 0;
1151 CheckInitWeight
wt(zero_inits
, non_zero_inits
);
1152 enquire
.set_weighting_scheme(wt
);
1153 Xapian::MSet mset
= enquire
.get_mset(0, 3);
1154 TEST_EQUAL(zero_inits
, 1);
1155 TEST_EQUAL(non_zero_inits
, 2);
// Weight subclass which checks, from inside the weighting callbacks, that the
// statistics exposed through the Weight API agree with values read directly
// from the database, and which accumulates wdf totals into caller-owned
// counters so the tests can compare them against the postlists.
1158 class CheckStatsWeight
: public Xapian::Weight
{
1160 double factor
= -1.0;
// Database used to obtain the reference values for the checks.
1162 Xapian::Database db
;
1166 // When testing OP_SYNONYM, term2 is also set.
1167 // When testing OP_WILDCARD, term2 == "*".
1168 // When testing a repeated term, term2 == "=" for the first occurrence and
1169 // "_" for subsequent occurrences.
1170 mutable string term2
;
// Accumulators owned by the caller; held by reference so that clones update
// the same totals.
1172 Xapian::termcount
& sum
;
1173 Xapian::termcount
& sum_squares
;
// Bounds cached by get_maxpart() on its first call; len_upper == 0 is used
// there as the "not fetched yet" marker.
1175 mutable Xapian::termcount len_upper
= 0;
1176 mutable Xapian::termcount len_lower
= Xapian::termcount(-1);
1177 mutable Xapian::termcount uniqueterms_upper
= 0;
1178 mutable Xapian::termcount uniqueterms_lower
= Xapian::termcount(-1);
1179 mutable Xapian::termcount wdf_upper
= 0;
// Main constructor: stores the database, both terms and the accumulator
// references, then requests the statistics listed below.
1181 CheckStatsWeight(const Xapian::Database
& db_
,
1182 const string
& term1_
,
1183 const string
& term2_
,
1184 Xapian::termcount
& sum_
,
1185 Xapian::termcount
& sum_squares_
)
1186 : db(db_
), term1(term1_
), term2(term2_
),
1187 sum(sum_
), sum_squares(sum_squares_
)
1189 need_stat(COLLECTION_SIZE
);
1190 need_stat(RSET_SIZE
);
1191 need_stat(AVERAGE_LENGTH
);
1192 need_stat(TERMFREQ
);
1193 need_stat(RELTERMFREQ
);
1194 need_stat(QUERY_LENGTH
);
1197 need_stat(DOC_LENGTH
);
1198 need_stat(DOC_LENGTH_MIN
);
1199 need_stat(DOC_LENGTH_MAX
);
1200 need_stat(DB_DOC_LENGTH_MIN
);
1201 need_stat(DB_DOC_LENGTH_MAX
);
1203 need_stat(COLLECTION_FREQ
);
1204 need_stat(UNIQUE_TERMS
);
1205 need_stat(UNIQUE_TERMS_MIN
);
1206 need_stat(UNIQUE_TERMS_MAX
);
1207 need_stat(DB_UNIQUE_TERMS_MIN
);
1208 need_stat(DB_UNIQUE_TERMS_MAX
);
1209 need_stat(TOTAL_LENGTH
);
1210 need_stat(WDF_DOC_MAX
);
// Single-term convenience constructor: delegates with an empty term2.
1213 CheckStatsWeight(const Xapian::Database
& db_
,
1214 const string
& term_
,
1215 Xapian::termcount
& sum_
,
1216 Xapian::termcount
& sum_squares_
)
1217 : CheckStatsWeight(db_
, term_
, string(), sum_
, sum_squares_
) { }
1219 void init(double factor_
) override
{
// Copy this object, sharing the same database, terms and accumulators.
1223 Weight
* clone() const override
{
1224 auto res
= new CheckStatsWeight(db
, term1
, term2
, sum
, sum_squares
);
1226 // The object passed to Enquire::set_weighting_scheme() is cloned
1227 // right away, and then cloned again for each term, and then
1228 // potentially once more for the term-independent weight
1229 // contribution. In the repeated case, we want to handle the first
1230 // actual term specially, so we arrange for that to have "=" for
1231 // term2, and subsequent clones to have "_", so that we accumulate
1232 // sum and sum_squares on the first occurrence only.
// Called per matching document: cross-checks the statistics available to the
// Weight object against the database and updates the accumulators.
1238 double get_sumpart(Xapian::termcount wdf
,
1239 Xapian::termcount doclen
,
1240 Xapian::termcount uniqueterms
,
1241 Xapian::termcount wdfdocmax
) const override
{
// Collection-wide statistics must match what the database reports.
1242 Xapian::doccount num_docs
= db
.get_doccount();
1243 TEST_EQUAL(get_collection_size(), num_docs
);
1244 TEST_EQUAL(get_rset_size(), 0);
1245 TEST_EQUAL(get_average_length(), db
.get_avlength());
1246 Xapian::totallength totlen
= get_total_length();
1247 TEST_EQUAL(totlen
, db
.get_total_length());
// Average length times document count, rounded to nearest, should give back
// the total length.
1248 double total_term_occurences
= get_average_length() * num_docs
;
1249 TEST_EQUAL(Xapian::totallength(total_term_occurences
+ 0.5), totlen
);
// Single term, or a repeated term ("=" / "_"): frequencies must equal those
// of term1 exactly.
1250 if (term2
.empty() || term2
== "=" || term2
== "_") {
1251 TEST_EQUAL(get_termfreq(), db
.get_termfreq(term1
));
1252 TEST_EQUAL(get_collection_freq(), db
.get_collection_freq(term1
));
1253 if (term2
.empty()) {
1254 TEST_EQUAL(get_query_length(), 1);
1256 TEST_EQUAL(get_query_length(), 2);
// Maximum and summed frequencies over the individual terms, used for the
// bounds checks further down.
1259 Xapian::doccount tfmax
= 0, tfsum
= 0;
1260 Xapian::termcount cfmax
= 0, cfsum
= 0;
1262 // OP_WILDCARD case.
1263 for (auto&& t
= db
.allterms_begin(term1
);
1264 t
!= db
.allterms_end(term1
); ++t
) {
1265 Xapian::doccount tf
= t
.get_termfreq();
1266 tout
<< "->" << *t
<< " " << tf
<< '\n';
1268 tfmax
= max(tfmax
, tf
);
1269 Xapian::termcount cf
= db
.get_collection_freq(*t
);
1271 cfmax
= max(cfmax
, cf
);
1273 TEST_EQUAL(get_query_length(), 1);
// Two explicit terms: the query length is expected to be 2.
1276 Xapian::doccount tf1
= db
.get_termfreq(term1
);
1277 Xapian::doccount tf2
= db
.get_termfreq(term2
);
1279 tfmax
= max(tf1
, tf2
);
1280 Xapian::termcount cf1
= db
.get_collection_freq(term1
);
1281 Xapian::termcount cf2
= db
.get_collection_freq(term2
);
1283 cfmax
= max(cf1
, cf2
);
1284 TEST_EQUAL(get_query_length(), 2);
1286 // Synonym occurs at least as many times as any term.
1287 TEST_REL(get_termfreq(), >=, tfmax
);
1288 TEST_REL(get_collection_freq(), >=, cfmax
);
1289 // Synonym can't occur more times than the terms do.
1290 TEST_REL(get_termfreq(), <=, tfsum
);
1291 TEST_REL(get_collection_freq(), <=, cfsum
);
1292 // Synonym can't occur more times than there are documents/terms.
1293 TEST_REL(get_termfreq(), <=, num_docs
);
1294 TEST_REL(get_collection_freq(), <=, totlen
);
// Per-document values must lie within the bounds cached by get_maxpart().
1296 TEST_EQUAL(get_reltermfreq(), 0);
1297 TEST_EQUAL(get_wqf(), 1);
1298 TEST_REL(doclen
,>=,len_lower
);
1299 TEST_REL(doclen
,<=,len_upper
);
1301 TEST_REL(uniqueterms
,>=,1);
1302 TEST_REL(uniqueterms_lower
,>=,1);
1303 TEST_REL(wdfdocmax
,>=,1);
1305 TEST_REL(uniqueterms
,>=,uniqueterms_lower
);
1306 TEST_REL(uniqueterms
,<=,uniqueterms_upper
);
1307 TEST_REL(uniqueterms
,<=,doclen
);
1308 TEST_REL(uniqueterms_upper
,<=,len_upper
);
1309 TEST_REL(wdf
,<=,wdf_upper
);
1310 TEST_REL(wdfdocmax
,<=,doclen
);
1311 TEST_REL(wdfdocmax
,>=,wdf
);
// Database-wide bounds must match the database exactly; the bounds seen by
// this object match exactly when db.size() == 1 and otherwise only need to
// lie within the database-wide bounds.
1313 auto db_len_lower
= db
.get_doclength_lower_bound();
1314 auto db_len_upper
= db
.get_doclength_upper_bound();
1315 auto db_uniqueterms_lower
= db
.get_unique_terms_lower_bound();
1316 auto db_uniqueterms_upper
= db
.get_unique_terms_upper_bound();
1317 TEST_EQUAL(get_db_doclength_lower_bound(), db_len_lower
);
1318 TEST_EQUAL(get_db_doclength_upper_bound(), db_len_upper
);
1319 TEST_EQUAL(get_db_unique_terms_lower_bound(), db_uniqueterms_lower
);
1320 TEST_EQUAL(get_db_unique_terms_upper_bound(), db_uniqueterms_upper
);
1321 if (db
.size() == 1) {
1322 TEST_EQUAL(len_lower
, db_len_lower
);
1323 TEST_EQUAL(len_upper
, db_len_upper
);
1324 TEST_EQUAL(uniqueterms_lower
, db_uniqueterms_lower
);
1325 TEST_EQUAL(uniqueterms_upper
, db_uniqueterms_upper
);
1327 TEST_REL(len_lower
,>=,db_len_lower
);
1328 TEST_REL(len_upper
,<=,db_len_upper
);
1329 TEST_REL(uniqueterms_lower
,>=,db_uniqueterms_lower
);
1330 TEST_REL(uniqueterms_upper
,<=,db_uniqueterms_upper
);
// Accumulate the squared wdf into the caller's total.
1334 sum_squares
+= wdf
* wdf
;
// On the first call (len_upper still 0) cache the length, unique-term and
// wdf bounds which get_sumpart() checks against.
1339 double get_maxpart() const override
{
1340 if (len_upper
== 0) {
1341 len_lower
= get_doclength_lower_bound();
1342 len_upper
= get_doclength_upper_bound();
1343 uniqueterms_lower
= get_unique_terms_lower_bound();
1344 uniqueterms_upper
= get_unique_terms_upper_bound();
1345 wdf_upper
= get_wdf_upper_bound();
// Term-independent contribution: the reciprocal of the document length.
1350 double get_sumextra(Xapian::termcount doclen
,
1352 Xapian::termcount
) const override
{
1353 return 1.0 / doclen
;
1356 double get_maxextra() const override
{ return 1.0; }
1359 /// Check the weight subclass gets the correct stats.
1360 DEFINE_TESTCASE(checkstatsweight1
, backend
&& !remote
) {
1361 Xapian::Database db
= get_database("apitest_simpledata");
1362 Xapian::Enquire
enquire(db
);
1363 Xapian::TermIterator a
;
1364 for (a
= db
.allterms_begin(); a
!= db
.allterms_end(); ++a
) {
1365 const string
& term
= *a
;
1366 enquire
.set_query(Xapian::Query(term
));
1367 Xapian::termcount sum
= 0;
1368 Xapian::termcount sum_squares
= 0;
1369 CheckStatsWeight
wt(db
, term
, sum
, sum_squares
);
1370 enquire
.set_weighting_scheme(wt
);
1371 Xapian::MSet mset
= enquire
.get_mset(0, db
.get_doccount());
1373 // The document order in the multi-db case isn't the same as the
1374 // postlist order on the combined DB, so it's hard to compare the
1375 // wdf for each document in the Weight objects, but we can sum
1376 // the wdfs and the squares of the wdfs which provides a decent
1377 // check that we're not getting the wrong wdf values (it ensures
1378 // they have the right mean and standard deviation).
1379 Xapian::termcount expected_sum
= 0;
1380 Xapian::termcount expected_sum_squares
= 0;
1381 Xapian::PostingIterator i
;
1382 for (i
= db
.postlist_begin(term
); i
!= db
.postlist_end(term
); ++i
) {
1383 Xapian::termcount wdf
= i
.get_wdf();
1384 expected_sum
+= wdf
;
1385 expected_sum_squares
+= wdf
* wdf
;
1387 TEST_EQUAL(sum
, expected_sum
);
1388 TEST_EQUAL(sum_squares
, expected_sum_squares
);
1392 /// Check the weight subclass gets the correct stats with OP_SYNONYM.
1393 // Regression test for bugs fixed in 1.4.1.
//
// Walks the database's terms two at a time, runs an OP_SYNONYM query over
// each pair with CheckStatsWeight installed, and compares the accumulated
// wdf totals with totals computed by merging the two postlists.
1394 DEFINE_TESTCASE(checkstatsweight2
, backend
&& !remote
) {
1395 Xapian::Database db
= get_database("apitest_simpledata");
1396 Xapian::Enquire
enquire(db
);
1397 Xapian::TermIterator a
;
// The iterator is advanced a second time inside the body, so each pass
// consumes two terms; a trailing unpaired term ends the loop.
1398 for (a
= db
.allterms_begin(); a
!= db
.allterms_end(); ++a
) {
1399 const string
& term1
= *a
;
1400 if (++a
== db
.allterms_end()) break;
1401 const string
& term2
= *a
;
1402 Xapian::Query
q(Xapian::Query::OP_SYNONYM
,
1403 Xapian::Query(term1
), Xapian::Query(term2
));
1404 tout
<< q
.get_description() << '\n';
1405 enquire
.set_query(q
);
// Totals filled in by the weight object while the match runs.
1406 Xapian::termcount sum
= 0;
1407 Xapian::termcount sum_squares
= 0;
1408 CheckStatsWeight
wt(db
, term1
, term2
, sum
, sum_squares
);
1409 enquire
.set_weighting_scheme(wt
);
1410 Xapian::MSet mset
= enquire
.get_mset(0, db
.get_doccount());
1412 // The document order in the multi-db case isn't the same as the
1413 // postlist order on the combined DB, so it's hard to compare the
1414 // wdf for each document in the Weight objects, but we can sum
1415 // the wdfs and the squares of the wdfs which provides a decent
1416 // check that we're not getting the wrong wdf values (it ensures
1417 // they have the right mean and standard deviation).
1418 Xapian::termcount expected_sum
= 0;
1419 Xapian::termcount expected_sum_squares
= 0;
// Merge the two postlists by document id; did1/did2 hold the current
// document id of each list.
1420 Xapian::PostingIterator i
= db
.postlist_begin(term1
);
1421 Xapian::PostingIterator j
= db
.postlist_begin(term2
);
1422 Xapian::docid did1
= *i
, did2
= *j
;
1424 // To calculate expected_sum_squares correctly we need to square
1425 // the sum per document.
1426 Xapian::termcount wdf
;
1428 wdf
= i
.get_wdf() + j
.get_wdf();
1430 } else if (did1
< did2
) {
1437 expected_sum
+= wdf
;
1438 expected_sum_squares
+= wdf
* wdf
;
// NOTE(review): Xapian::docid(-1) appears to mark a postlist as exhausted;
// the merge stops once both lists have reached that state - confirm.
1441 if (++i
!= db
.postlist_end(term1
)) {
1444 if (did2
== Xapian::docid(-1)) break;
1445 did1
= Xapian::docid(-1);
1449 if (++j
!= db
.postlist_end(term2
)) {
1452 if (did1
== Xapian::docid(-1)) break;
1453 did2
= Xapian::docid(-1);
1457 // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1458 // the individual terms.
1459 TEST_EQUAL(sum
, expected_sum
);
// Only a lower bound is asserted for the sum of squares.
1460 TEST_REL(sum_squares
, >=, expected_sum_squares
);
1464 /// Check the weight subclass gets the correct stats with OP_WILDCARD.
1465 // Regression test for bug fixed in 1.4.1.
//
// For each prefix in `testcases`, runs an OP_WILDCARD query with
// CheckStatsWeight installed (term2 == "*") and compares the accumulated wdf
// totals with totals computed by merging the postlists of all terms which
// start with that prefix.
1466 DEFINE_TESTCASE(checkstatsweight3
, backend
&& !remote
) {
1467 // The most correct thing to do would be to collate termfreqs across shards
1468 // for this, but if that's too hard to do efficiently we could at least
1469 // scale up the termfreqs proportional to the size of the shard.
1470 XFAIL_FOR_BACKEND("multi", "OP_WILDCARD+OP_SYNONYM use shard termfreqs");
// Comparison functor (used as PlCmp below) ordering PostingIterators for
// the heap operations.
// NOTE(review): presumably orders by current document id - confirm.
1473 bool operator()(const Xapian::PostingIterator
& a
,
1474 const Xapian::PostingIterator
& b
) {
1479 Xapian::Database db
= get_database("apitest_simpledata");
1480 Xapian::Enquire
enquire(db
);
1481 Xapian::TermIterator a
;
// Wildcard prefixes to test.
1482 static const char * const testcases
[] = {
1483 "a", // a* matches all documents, but no term matches all.
1484 "pa", // Expands to only "paragraph", matching 5.
1485 "zulu", // No matches.
1486 "th", // Term "this" matches all documents.
1488 for (auto pattern
: testcases
) {
1489 Xapian::Query
q(Xapian::Query::OP_WILDCARD
, pattern
);
1490 tout
<< q
.get_description() << '\n';
1491 enquire
.set_query(q
);
// Totals filled in by the weight object while the match runs.
1492 Xapian::termcount sum
= 0;
1493 Xapian::termcount sum_squares
= 0;
1494 CheckStatsWeight
wt(db
, pattern
, "*", sum
, sum_squares
);
1495 enquire
.set_weighting_scheme(wt
);
1496 Xapian::MSet mset
= enquire
.get_mset(0, db
.get_doccount());
1498 // The document order in the multi-db case isn't the same as the
1499 // postlist order on the combined DB, so it's hard to compare the
1500 // wdf for each document in the Weight objects, but we can sum
1501 // the wdfs and the squares of the wdfs which provides a decent
1502 // check that we're not getting the wrong wdf values (it ensures
1503 // they have the right mean and standard deviation).
1504 Xapian::termcount expected_sum
= 0;
1505 Xapian::termcount expected_sum_squares
= 0;
// Collect one postlist per term starting with the prefix and arrange them
// as a heap.
1506 vector
<Xapian::PostingIterator
> postlists
;
1507 for (auto&& t
= db
.allterms_begin(pattern
);
1508 t
!= db
.allterms_end(pattern
); ++t
) {
1509 postlists
.emplace_back(db
.postlist_begin(*t
));
1511 Heap::make(postlists
.begin(), postlists
.end(), PlCmp());
// did/wdf track the document currently being accumulated.
1512 Xapian::docid did
= 0;
1513 Xapian::termcount wdf
= 0;
1514 while (!postlists
.empty()) {
1515 Xapian::docid did_new
= *postlists
.front();
1516 Xapian::termcount wdf_new
= postlists
.front().get_wdf();
// Advance the front postlist; drop it from the heap once it compares equal
// to a default-constructed iterator, otherwise restore the heap order.
1517 if (++(postlists
.front()) == Xapian::PostingIterator()) {
1518 Heap::pop(postlists
.begin(), postlists
.end(), PlCmp());
1519 postlists
.pop_back();
1521 Heap::replace(postlists
.begin(), postlists
.end(), PlCmp());
// A new document id: flush the totals for the previous document.
1523 if (did_new
!= did
) {
1524 expected_sum
+= wdf
;
1525 expected_sum_squares
+= wdf
* wdf
;
// Flush the totals for the final document.
1531 expected_sum
+= wdf
;
1532 expected_sum_squares
+= wdf
* wdf
;
1533 // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1534 // the individual terms.
1535 TEST_EQUAL(sum
, expected_sum
);
// Only a lower bound is asserted for the sum of squares.
1536 TEST_REL(sum_squares
, >=, expected_sum_squares
);
1540 /// Check the stats for a repeated term are correct.
1541 // Regression test for bug fixed in 1.4.6. Doesn't work with
1542 // multi as the weight object is cloned more times.
1543 DEFINE_TESTCASE(checkstatsweight4
, backend
&& !remote
&& !multi
) {
1544 Xapian::Database db
= get_database("apitest_simpledata");
1545 Xapian::Enquire
enquire(db
);
1546 Xapian::TermIterator a
;
1547 for (a
= db
.allterms_begin(); a
!= db
.allterms_end(); ++a
) {
1548 const string
& term
= *a
;
1549 enquire
.set_query(Xapian::Query(term
, 1, 1) |
1550 Xapian::Query(term
, 1, 2));
1551 Xapian::termcount sum
= 0;
1552 Xapian::termcount sum_squares
= 0;
1553 CheckStatsWeight
wt(db
, term
, "=", sum
, sum_squares
);
1554 enquire
.set_weighting_scheme(wt
);
1555 Xapian::MSet mset
= enquire
.get_mset(0, db
.get_doccount());
1557 // The document order in the multi-db case isn't the same as the
1558 // postlist order on the combined DB, so it's hard to compare the
1559 // wdf for each document in the Weight objects, but we can sum
1560 // the wdfs and the squares of the wdfs which provides a decent
1561 // check that we're not getting the wrong wdf values (it ensures
1562 // they have the right mean and standard deviation).
1563 Xapian::termcount expected_sum
= 0;
1564 Xapian::termcount expected_sum_squares
= 0;
1565 Xapian::PostingIterator i
;
1566 for (i
= db
.postlist_begin(term
); i
!= db
.postlist_end(term
); ++i
) {
1567 Xapian::termcount wdf
= i
.get_wdf();
1568 expected_sum
+= wdf
;
1569 expected_sum_squares
+= wdf
* wdf
;
1571 TEST_EQUAL(sum
, expected_sum
);
1572 TEST_EQUAL(sum_squares
, expected_sum_squares
);
// Weight subclass used by checkstatsweight5: its get_sumpart() asserts that
// the wdfdocmax value it is passed equals the length of document `did`.
1576 class CheckStatsWeight5
: public Xapian::Weight
{
// Document id used for the doclength lookup in get_sumpart(); mutable so it
// can be changed from const member functions.
1578 mutable Xapian::docid did
= 0;
// Database used to look up the reference document length.
1582 Xapian::Database db
;
// stat_code_ selects which statistics are requested via the switch below
// (DOC_LENGTH and WDF_DOC_MAX are among them); it defaults to '\0'.
// NOTE(review): the mapping from code to statistic should be confirmed
// against the full switch.
1587 CheckStatsWeight5(const Xapian::Database
& db_
, char stat_code_
= '\0')
1588 : factor(-1.0), db(db_
), stat_code(stat_code_
)
1590 switch (stat_code
) {
1595 need_stat(DOC_LENGTH
);
1598 need_stat(WDF_DOC_MAX
);
1601 void init(double factor_
) override
{
// Clones are built from the same database and stat code.
1605 Weight
* clone() const override
{
1606 return new CheckStatsWeight5(db
, stat_code
);
// Only the wdfdocmax argument is named in the visible parameter list.
1609 double get_sumpart(Xapian::termcount
,
1612 Xapian::termcount wdfdocmax
) const override
{
1613 // The query is a synonym of all terms, so should match all documents.
1615 TEST_REL(wdfdocmax
,==,db
.get_doclength(did
));
1616 return 1.0 / wdfdocmax
;
1619 double get_maxpart() const override
{
1624 /// Check wdfdocmax is clamped to doclen even if wdf and doclen aren't wanted.
1625 DEFINE_TESTCASE(checkstatsweight5
, backend
&& !multi
&& !remote
) {
1626 Xapian::Database db
= get_database("apitest_simpledata");
1627 Xapian::Enquire
enquire(db
);
1628 Xapian::Query q
{Xapian::Query::OP_SYNONYM
,
1629 db
.allterms_begin(),
1631 enquire
.set_query(q
);
1632 enquire
.set_weighting_scheme(CheckStatsWeight5(db
));
1633 Xapian::MSet mset1
= enquire
.get_mset(0, db
.get_doccount());
1634 enquire
.set_weighting_scheme(CheckStatsWeight5(db
, 'w'));
1635 Xapian::MSet mset2
= enquire
.get_mset(0, db
.get_doccount());
1636 enquire
.set_weighting_scheme(CheckStatsWeight5(db
, 'd'));
1637 Xapian::MSet mset3
= enquire
.get_mset(0, db
.get_doccount());
1640 // Two stage should perform same as Jelinek mercer if smoothing parameter for mercer is kept 1 in both.
1641 DEFINE_TESTCASE(unigramlmweight4
, backend
) {
1642 Xapian::Database db
= get_database("apitest_simpledata");
1643 Xapian::Enquire
enquire1(db
);
1644 Xapian::Enquire
enquire2(db
);
1645 enquire1
.set_query(Xapian::Query("paragraph"));
1647 enquire2
.set_query(Xapian::Query("paragraph"));
1649 // 5 documents available with term paragraph so mset size should be 5
1650 enquire1
.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::TWO_STAGE_SMOOTHING
, 1, 0));
1651 enquire2
.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::JELINEK_MERCER_SMOOTHING
, 1, 0));
1652 mset1
= enquire1
.get_mset(0, 10);
1653 mset2
= enquire2
.get_mset(0, 10);
1655 TEST_EQUAL(mset1
.size(), 5);
1656 TEST_EQUAL_DOUBLE(mset1
[1].get_weight(), mset2
[1].get_weight());
1659 /* Test for checking if we don't use smoothing all
1660 * of them should give same result i.e wdf_double/len_double */
1661 DEFINE_TESTCASE(unigramlmweight5
, backend
) {
1662 Xapian::Database db
= get_database("apitest_simpledata");
1663 Xapian::Enquire
enquire1(db
);
1664 Xapian::Enquire
enquire2(db
);
1665 Xapian::Enquire
enquire3(db
);
1666 Xapian::Enquire
enquire4(db
);
1667 enquire1
.set_query(Xapian::Query("paragraph"));
1669 enquire2
.set_query(Xapian::Query("paragraph"));
1671 enquire3
.set_query(Xapian::Query("paragraph"));
1673 enquire4
.set_query(Xapian::Query("paragraph"));
1675 // 5 documents available with term paragraph so mset size should be 5
1676 enquire1
.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::TWO_STAGE_SMOOTHING
, 0, 0));
1677 enquire2
.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING
, 0, 0));
1678 enquire3
.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING
, 0, 0));
1679 enquire4
.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::DIRICHLET_SMOOTHING
, 0, 0));
1681 mset1
= enquire1
.get_mset(0, 10);
1682 mset2
= enquire2
.get_mset(0, 10);
1683 mset3
= enquire3
.get_mset(0, 10);
1684 mset4
= enquire4
.get_mset(0, 10);
1686 TEST_EQUAL(mset1
.size(), 5);
1687 TEST_EQUAL(mset2
.size(), 5);
1688 TEST_EQUAL(mset3
.size(), 5);
1689 TEST_EQUAL(mset4
.size(), 5);
1690 for (Xapian::doccount i
= 0; i
< 5; ++i
) {
1691 TEST_EQUAL_DOUBLE(mset3
[i
].get_weight(), mset4
[i
].get_weight());
1692 TEST_EQUAL_DOUBLE(mset2
[i
].get_weight(), mset4
[i
].get_weight());
1693 TEST_EQUAL_DOUBLE(mset1
[i
].get_weight(), mset2
[i
].get_weight());
1694 TEST_EQUAL_DOUBLE(mset3
[i
].get_weight(), mset2
[i
].get_weight());
1695 TEST_EQUAL_DOUBLE(mset1
[i
].get_weight(), mset4
[i
].get_weight());
1696 TEST_EQUAL_DOUBLE(mset1
[i
].get_weight(), mset3
[i
].get_weight());
1700 // Feature test for Dir+ function.
1701 DEFINE_TESTCASE(unigramlmweight7
, backend
) {
1702 Xapian::Database db
= get_database("apitest_simpledata");
1703 Xapian::Enquire
enquire1(db
);
1704 Xapian::Enquire
enquire2(db
);
1705 enquire1
.set_query(Xapian::Query("paragraph"));
1706 enquire2
.set_query(Xapian::Query("paragraph"));
1710 enquire1
.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING
, 2000, 0));
1711 enquire2
.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING
, 2000, 0.05));
1713 mset1
= enquire1
.get_mset(0, 10);
1714 mset2
= enquire2
.get_mset(0, 10);
1716 // mset size should be 5
1717 TEST_EQUAL(mset1
.size(), 5);
1718 TEST_EQUAL(mset2
.size(), 5);
1720 // Expect mset weights associated with Dir+ more than mset weights by Dir
1721 // because of the presence of extra weight component in Dir+ function.
1722 TEST_REL(mset2
[0].get_weight(),>,mset1
[0].get_weight());
1723 TEST_REL(mset2
[1].get_weight(),>,mset1
[1].get_weight());
1724 TEST_REL(mset2
[2].get_weight(),>,mset1
[2].get_weight());
1725 TEST_REL(mset2
[3].get_weight(),>,mset1
[3].get_weight());
1726 TEST_REL(mset2
[4].get_weight(),>,mset1
[4].get_weight());
1729 // Regression test that OP_SCALE_WEIGHT works with LMWeight (fixed in 1.4.1).
//
// The query is run once unscaled and once wrapped in OP_SCALE_WEIGHT with a
// factor of 15.0, using the same LMWeight settings both times.  The scaled run
// must return the same number of matches, and each scaled weight must equal
// 15.0 times the unscaled weight at the same index.
1730 DEFINE_TESTCASE(unigramlmweight8
, backend
) {
1731 Xapian::Database db
= get_database("apitest_simpledata");
1732 Xapian::Enquire
enquire(db
);
1733 Xapian::Query
query("paragraph");
// Baseline run: the plain single-term query.
1735 enquire
.set_query(query
);
1736 enquire
.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING
, 2000, 0));
1739 mset1
= enquire
.get_mset(0, 10);
1740 TEST_EQUAL(mset1
.size(), 5);
// Scaled run: the same query wrapped in OP_SCALE_WEIGHT.  The weighting
// scheme is set again with the same arguments as for the baseline run.
1742 enquire
.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT
, query
, 15.0));
1743 enquire
.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING
, 2000, 0));
1746 mset2
= enquire
.get_mset(0, 10);
1747 TEST_EQUAL(mset2
.size(), mset1
.size());
// Check the top baseline weight is non-zero - presumably so that the
// comparison below cannot pass trivially because both sides are zero.
1748 TEST_NOT_EQUAL_DOUBLE(mset1
[0].get_weight(), 0.0);
// Each scaled weight must be 15.0 times the baseline weight at that index.
1749 for (Xapian::doccount i
= 0; i
< mset1
.size(); ++i
) {
1750 TEST_EQUAL_DOUBLE(15.0 * mset1
[i
].get_weight(), mset2
[i
].get_weight());
1754 // Feature test for CoordWeight.
//
// Runs an OP_OR query over a fixed list of terms with CoordWeight as the
// weighting scheme, then checks for every match that its weight equals the
// matching_terms count obtained for that match.
1755 DEFINE_TESTCASE(coordweight1
, backend
) {
1756 Xapian::Enquire
enquire(get_database("apitest_simpledata"));
1757 enquire
.set_weighting_scheme(Xapian::CoordWeight());
// Terms to combine with OP_OR.
1758 static const char * const terms
[] = {
1759 "this", "line", "paragraph", "rubbish"
// The query is built from the iterator range [terms, std::end(terms)), i.e.
// every entry in the array.
1761 Xapian::Query
query(Xapian::Query::OP_OR
, terms
, std::end(terms
));
1762 enquire
.set_query(query
);
// Ask for up to 100 matches starting from the first.
1763 Xapian::MSet mymset1
= enquire
.get_mset(0, 100);
1764 // CoordWeight scores 1 for each matching term, so the weight should equal
1765 // the number of matching terms.
1766 for (Xapian::MSetIterator i
= mymset1
.begin(); i
!= mymset1
.end(); ++i
) {
1767 Xapian::termcount matching_terms
= 0;
// Iterate from get_matching_terms_begin(i) until get_matching_terms_end(i)
// is reached.
1768 Xapian::TermIterator t
= enquire
.get_matching_terms_begin(i
);
1769 while (t
!= enquire
.get_matching_terms_end(i
)) {
// The weight of this match must equal the count for this match.
1773 TEST_EQUAL(i
.get_weight(), matching_terms
);
// Test of DiceCoeffWeight: runs an OP_OR query built from the terms array
// against the "apitest_simpledata3" database and checks that exactly four
// matches are returned with the expected precomputed weights.
1778 DEFINE_TESTCASE(dicecoeffweight2
, backend
) {
1779 Xapian::Database db
= get_database("apitest_simpledata3");
1780 Xapian::Enquire
enquire(db
);
1781 static const char * const terms
[] = {
// The query is built from the iterator range [terms, std::end(terms)), i.e.
// every entry in the array.
1784 Xapian::Query
query(Xapian::Query::OP_OR
, terms
, std::end(terms
));
1785 enquire
.set_query(query
);
1786 enquire
.set_weighting_scheme(Xapian::DiceCoeffWeight());
// Ask for up to 10 matches starting from the first; 4 are expected.
1789 mset1
= enquire
.get_mset(0, 10);
1790 TEST_EQUAL(mset1
.size(), 4);
1792 /* The weight value has been manually calculated by using the statistics
1793 * of the test database. */
// The expected values are listed in MSet index order; 0.571428571428571 and
// 0.181818181818182 are 4/7 and 2/11 rounded to 15 decimal places.
1794 TEST_EQUAL_DOUBLE(mset1
[0].get_weight(), 0.571428571428571);
1795 TEST_EQUAL_DOUBLE(mset1
[1].get_weight(), 0.5);
1796 TEST_EQUAL_DOUBLE(mset1
[2].get_weight(), 0.2);
1797 TEST_EQUAL_DOUBLE(mset1
[3].get_weight(), 0.181818181818182);
1800 // Test handling of a term with zero wdf.
//
// A database is generated containing a single document, then an OP_SYNONYM of
// that document's two boolean terms is run under DiceCoeffWeight, wrapped in
// OP_SCALE_WEIGHT.  Exactly one match with weight 2.0 is expected.
1801 DEFINE_TESTCASE(dicecoeffweight3
, backend
) {
// The lambda passed to get_database() populates the writable database.
1802 Xapian::Database db
= get_database("dicecoeffweight3",
1803 [](Xapian::WritableDatabase
& wdb
,
// One document: "radio" is added with an explicit second argument of 2,
// "seahorse" and "zebra" with the default, plus the boolean terms "false"
// and "true".
1805 Xapian::Document doc
;
1806 doc
.add_term("radio", 2);
1807 doc
.add_term("seahorse");
1808 doc
.add_term("zebra");
1809 doc
.add_boolean_term("false");
1810 doc
.add_boolean_term("true");
1811 wdb
.add_document(doc
);
1813 Xapian::Enquire
enquire(db
);
1814 enquire
.set_weighting_scheme(Xapian::DiceCoeffWeight());
1816 // OP_SYNONYM gives wdf zero if need_stat(WDF) isn't specified (and
1817 // it isn't by DiceCoeffWeight).
1818 Xapian::Query
q(Xapian::Query::OP_SYNONYM
,
1819 Xapian::Query("false"), Xapian::Query("true"));
// The query is wrapped in OP_SCALE_WEIGHT; the scale factor appears as 6.0
// in the expected-weight calculation below.
1820 enquire
.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT
,
1822 Xapian::MSet mset
= enquire
.get_mset(0, 10);
1823 TEST_EQUAL(mset
.size(), 1);
1825 // factor * 2.0 * wqf / (query_length + unique_term_count)
1826 // = 6.0 * 2.0 * 1 / (2 + 4) = 2.0
1827 TEST_EQUAL_DOUBLE(mset
[0].get_weight(), 2.0);