xapian-core/tests/api_weight.cc

   1 /** @file
   2  * @brief tests of Xapian::Weight subclasses
   3  */
   4 /* Copyright (C) 2004-2024 Olly Betts
   5  * Copyright (C) 2013 Aarsh Shah
   6  * Copyright (C) 2016 Vivek Pal
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  21  */
  22
  23 #include <config.h>
  24
  25 #include "api_weight.h"
  26 #include <cmath>
  27 #include <memory>
  28
  29 #include <xapian.h>
  30
  31 #include "apitest.h"
  32 #include "heap.h"
  33 #include "testutils.h"
  34
  35 using namespace std;
  36
  37 template<class W>
  38 static inline void
  39 test_weight_class_no_params(const char* name)
  40 {
  41     tout << name << '\n';
  42     W obj;
  43     // Check name() returns the class name.
  44     TEST_EQUAL(obj.name(), name);
  45     // If there are no parameters, there's nothing to serialise.
  46     string obj_serialised = obj.serialise();
  47     TEST_EQUAL(obj_serialised.size(), 0);
  48     // Check serialising and unserialising gives object with same serialisation.
  49     unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
  50     TEST_EQUAL(obj_serialised, wt->serialise());
  51     // Check that unserialise() throws suitable error for bad serialisation.
  52     // The easy case to test is extra junk after the serialised weight.
  53     try {
  54         unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
  55         FAIL_TEST(name << " did not throw for unserialise with junk appended");
  56     } catch (const Xapian::SerialisationError& e) {
  57         // Check the exception message contains the weighting scheme name
  58         // (regression test for TradWeight's exception saying "BM25").
  59         string target = name + CONST_STRLEN("Xapian::");
  60         TEST(e.get_msg().find(target) != string::npos);
  61     }
  62 }
  63
  64 #define TEST_WEIGHT_CLASS_NO_PARAMS(W) test_weight_class_no_params<W>(#W)
  65
  66 template<class W>
  67 static inline void
  68 test_weight_class(const char* name, const W& obj_default, const W& obj_other)
  69 {
  70     tout << name << '\n';
  71     W obj;
  72     // Check name() returns the class name.
  73     TEST_EQUAL(obj.name(), name);
  74     TEST_EQUAL(obj_default.name(), name);
  75     TEST_EQUAL(obj_other.name(), name);
  76     // Check serialisation matches that of object constructed with explicit
  77     // parameter values of what the defaults are meant to be.
  78     string obj_serialised = obj.serialise();
  79     TEST_EQUAL(obj_serialised, obj_default.serialise());
  80     // Check serialisation is different to object with different parameters.
  81     string obj_other_serialised = obj_other.serialise();
  82     TEST_NOT_EQUAL(obj_serialised, obj_other_serialised);
  83     // Check serialising and unserialising gives object with same serialisation.
  84     unique_ptr<Xapian::Weight> wt(W().unserialise(obj_serialised));
  85     TEST_EQUAL(obj_serialised, wt->serialise());
  86     // Check serialising and unserialising of object with different parameters.
  87     unique_ptr<Xapian::Weight> wt2(W().unserialise(obj_other_serialised));
  88     TEST_EQUAL(obj_other_serialised, wt2->serialise());
  89     // Check that unserialise() throws suitable error for bad serialisation.
  90     // The easy case to test is extra junk after the serialised weight.
  91     try {
  92         unique_ptr<Xapian::Weight> bad(W().unserialise(obj_serialised + "X"));
  93         FAIL_TEST(name << " did not throw for unserialise with junk appended");
  94     } catch (const Xapian::SerialisationError& e) {
  95         // Check the exception message contains the weighting scheme name
  96         // (regression test for TradWeight's exception saying "BM25").
  97         string target = name + CONST_STRLEN("Xapian::");
  98         TEST(e.get_msg().find(target) != string::npos);
  99     }
 100 }
 101
 102 // W Should be the class name.
 103 //
 104 // DEFAULT should be a parenthesised parameter list to explicitly construct
 105 // an object of class W with the documented default parameters.
 106 //
 107 // OTHER should be a parenthesised parameter list to construct an object with
 108 // non-default parameters.
 109 #define TEST_WEIGHT_CLASS(W, DEFAULT, OTHER) \
 110     test_weight_class<W>(#W, W DEFAULT, W OTHER)
 111
 112 /// Test serialisation and introspection of built-in weighting schemes.
 113 DEFINE_TESTCASE(weightserialisation1, !backend) {
 114     // Parameter-free weighting schemes.
 115     TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::BoolWeight);
 116     TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::CoordWeight);
 117     TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DLHWeight);
 118     TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DPHWeight);
 119     TEST_WEIGHT_CLASS_NO_PARAMS(Xapian::DiceCoeffWeight);
 120
 121     // Parameterised weighting schemes.
 122     TEST_WEIGHT_CLASS(Xapian::TradWeight, (1.0), (2.0));
 123     TEST_WEIGHT_CLASS(Xapian::BM25Weight,
 124                       (1, 0, 1, 0.5, 0.5),
 125                       (1, 0.5, 1, 0.5, 0.5));
 126     TEST_WEIGHT_CLASS(Xapian::BM25PlusWeight,
 127                       (1, 0, 1, 0.5, 0.5, 1.0),
 128                       (1, 0, 1, 0.5, 0.5, 2.0));
 129     TEST_WEIGHT_CLASS(Xapian::TfIdfWeight, ("ntn"), ("bpn"));
 130     TEST_WEIGHT_CLASS(Xapian::InL2Weight, (1.0), (2.0));
 131     TEST_WEIGHT_CLASS(Xapian::IfB2Weight, (1.0), (2.0));
 132     TEST_WEIGHT_CLASS(Xapian::IneB2Weight, (1.0), (2.0));
 133     TEST_WEIGHT_CLASS(Xapian::BB2Weight, (1.0), (2.0));
 134     TEST_WEIGHT_CLASS(Xapian::PL2Weight, (1.0), (2.0));
 135     TEST_WEIGHT_CLASS(Xapian::PL2PlusWeight,
 136                       (1.0, 0.8),
 137                       (2.0, 0.9));
 138     TEST_WEIGHT_CLASS(Xapian::LMWeight,
 139                       (0.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0.7, 2000.0),
 140                       (0.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0.7));
 141 }
 142
 143 /// Basic test of using weighting schemes.
 144 DEFINE_TESTCASE(weight1, backend) {
 145     Xapian::Database db(get_database("etext"));
 146     Xapian::Enquire enquire(db);
 147     Xapian::Enquire enquire_scaled(db);
 148     auto term = "robinson";
 149     Xapian::Query q{term};
 150     enquire.set_query(q);
 151     enquire_scaled.set_query(q * 15.0);
 152     auto expected_matches = db.get_termfreq(term);
 153     auto helper = [&](const Xapian::Weight& weight,
 154                       string_view name,
 155                       string_view params) {
 156         tout << name << '(' << params << ")\n";
 157         enquire.set_weighting_scheme(weight);
 158         enquire_scaled.set_weighting_scheme(weight);
 159         Xapian::MSet mset = enquire.get_mset(0, expected_matches + 1);
 160         TEST_EQUAL(mset.size(), expected_matches);
 161         if (name == "Xapian::BoolWeight") {
 162             /* All weights should be zero. */
 163             TEST_EQUAL(mset[0].get_weight(), 0.0);
 164             TEST_EQUAL(mset.back().get_weight(), 0.0);
 165         } else if (name == "Xapian::CoordWeight") {
 166             /* All weights should be 1 for a single term query. */
 167             TEST_EQUAL(mset[0].get_weight(), 1.0);
 168             TEST_EQUAL(mset.back().get_weight(), 1.0);
 169         } else if (!params.empty()) {
 170             /* All weights should be equal with these particular parameters. */
 171             TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
 172             TEST_EQUAL(mset[0].get_weight(), mset.back().get_weight());
 173         } else {
 174             TEST_NOT_EQUAL(mset[0].get_weight(), 0.0);
 175             TEST_NOT_EQUAL(mset[0].get_weight(), mset.back().get_weight());
 176         }
 177         Xapian::MSet mset_scaled = enquire_scaled.get_mset(0, expected_matches);
 178         TEST_EQUAL(mset_scaled.size(), expected_matches);
 179         for (Xapian::doccount i = 0; i < expected_matches; ++i) {
 180             TEST_EQUAL_DOUBLE(mset_scaled[i].get_weight(),
 181                               mset[i].get_weight() * 15.0);
 182         }
 183     };
 184 #define TEST_WEIGHTING_SCHEME(W, ...) helper(W(__VA_ARGS__), #W, #__VA_ARGS__)
 185     TEST_WEIGHTING_SCHEME(Xapian::BoolWeight);
 186     TEST_WEIGHTING_SCHEME(Xapian::CoordWeight);
 187     TEST_WEIGHTING_SCHEME(Xapian::DLHWeight);
 188     TEST_WEIGHTING_SCHEME(Xapian::DPHWeight);
 189     TEST_WEIGHTING_SCHEME(Xapian::DiceCoeffWeight);
 190     TEST_WEIGHTING_SCHEME(Xapian::TradWeight);
 191     TEST_WEIGHTING_SCHEME(Xapian::BM25Weight);
 192     TEST_WEIGHTING_SCHEME(Xapian::BM25PlusWeight);
 193     TEST_WEIGHTING_SCHEME(Xapian::TfIdfWeight);
 194     TEST_WEIGHTING_SCHEME(Xapian::InL2Weight);
 195     TEST_WEIGHTING_SCHEME(Xapian::IfB2Weight);
 196     TEST_WEIGHTING_SCHEME(Xapian::IneB2Weight);
 197     TEST_WEIGHTING_SCHEME(Xapian::BB2Weight);
 198     TEST_WEIGHTING_SCHEME(Xapian::PL2Weight);
 199     TEST_WEIGHTING_SCHEME(Xapian::PL2PlusWeight);
 200     TEST_WEIGHTING_SCHEME(Xapian::LMWeight);
 201     // Regression test for bug fixed in 1.2.4.
 202     TEST_WEIGHTING_SCHEME(Xapian::BM25Weight, 0, 0, 0, 0, 1);
 203     /* As mentioned in the documentation, when parameter k is 0, wdf and
 204      * document length don't affect the weights.  Regression test for bug fixed
 205      * in 1.2.4.
 206      */
 207     TEST_WEIGHTING_SCHEME(Xapian::TradWeight, 0);
 208 #undef TEST_WEIGHTING_SCHEME
 209 }
 210
 211 /** Regression test for bug fixed in 1.0.5.
 212  *
 213  * This test would fail under valgrind because it used an uninitialised value.
 214  */
 215 DEFINE_TESTCASE(bm25weight1, backend) {
 216     Xapian::Enquire enquire(get_database("apitest_simpledata"));
 217     enquire.set_weighting_scheme(Xapian::BM25Weight(1, 25, 1, 0.01, 0.5));
 218     enquire.set_query(Xapian::Query("word"));
 219
 220     Xapian::MSet mset = enquire.get_mset(0, 25);
 221 }
 222
 223 // Test parameter combinations which should be unaffected by doclength.
 224 DEFINE_TESTCASE(bm25weight4, backend) {
 225     Xapian::Database db = get_database("apitest_simpledata");
 226     Xapian::Enquire enquire(db);
 227     enquire.set_query(Xapian::Query("paragraph"));
 228     Xapian::MSet mset;
 229
 230     enquire.set_weighting_scheme(Xapian::BM25Weight(1, 0, 1, 0, 0.5));
 231     mset = enquire.get_mset(0, 10);
 232     TEST_EQUAL(mset.size(), 5);
 233     // Expect: wdf has an effect on weight, but doclen doesn't.
 234     TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
 235     TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
 236     TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
 237     TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
 238
 239     enquire.set_weighting_scheme(Xapian::BM25Weight(0, 0, 1, 1, 0.5));
 240     mset = enquire.get_mset(0, 10);
 241     TEST_EQUAL(mset.size(), 5);
 242     // Expect: neither wdf nor doclen affects weight.
 243     TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
 244 }
 245
 246 /// Test non-zero k2 with zero k1.
 247 // Regression test for bug fixed in 1.2.17 and 1.3.2.
 248 DEFINE_TESTCASE(bm25weight5, backend) {
 249     Xapian::Database db = get_database("apitest_simpledata");
 250     Xapian::Enquire enquire(db);
 251     enquire.set_query(Xapian::Query("paragraph"));
 252     Xapian::MSet mset;
 253
 254     enquire.set_weighting_scheme(Xapian::BM25Weight(0, 1, 1, 0.5, 0.5));
 255     mset = enquire.get_mset(0, 10);
 256     TEST_EQUAL(mset.size(), 5);
 257     // Expect: wdf has no effect on weight; shorter docs rank higher.
 258     mset_expect_order(mset, 3, 5, 1, 4, 2);
 259     TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[1].get_weight());
 260     TEST_REL(mset[1].get_weight(),>,mset[2].get_weight());
 261     TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
 262     TEST_REL(mset[3].get_weight(),>,mset[4].get_weight());
 263 }
 264
 265 // Test parameter combinations which should be unaffected by doclength.
 266 DEFINE_TESTCASE(bm25plusweight2, backend) {
 267     Xapian::Database db = get_database("apitest_simpledata");
 268     Xapian::Enquire enquire(db);
 269     enquire.set_query(Xapian::Query("paragraph"));
 270     Xapian::MSet mset;
 271
 272     enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0, 0.5, 1));
 273     mset = enquire.get_mset(0, 10);
 274     TEST_EQUAL(mset.size(), 5);
 275     // Expect: wdf has an effect on weight, but doclen doesn't.
 276     TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
 277     TEST_EQUAL_DOUBLE(mset[1].get_weight(), mset[2].get_weight());
 278     TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
 279     TEST_EQUAL_DOUBLE(mset[3].get_weight(), mset[4].get_weight());
 280
 281     enquire.set_weighting_scheme(Xapian::BM25PlusWeight(0, 0, 1, 1, 0.5, 1));
 282     mset = enquire.get_mset(0, 10);
 283     TEST_EQUAL(mset.size(), 5);
 284     // Expect: neither wdf nor doclen affects weight.
 285     TEST_EQUAL_DOUBLE(mset[0].get_weight(), mset[4].get_weight());
 286 }
 287
 288 // Regression test for a mistake corrected in the BM25+ implementation.
 289 DEFINE_TESTCASE(bm25plusweight3, backend) {
 290     Xapian::Database db = get_database("apitest_simpledata");
 291     Xapian::Enquire enquire(db);
 292     enquire.set_query(Xapian::Query("paragraph"));
 293     Xapian::MSet mset;
 294
 295     enquire.set_weighting_scheme(Xapian::BM25PlusWeight(1, 0, 1, 0.5, 0.5, 1));
 296     mset = enquire.get_mset(0, 10);
 297     TEST_EQUAL(mset.size(), 5);
 298
 299     // The value of each doc weight calculated manually from the BM25+ formulae
 300     // by using the respective document statistics.
 301     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.7920796567487473);
 302     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.7846980783848447);
 303     TEST_EQUAL_DOUBLE(mset[2].get_weight(), 0.7558817623365934);
 304     TEST_EQUAL_DOUBLE(mset[3].get_weight(), 0.7210119356168847);
 305     TEST_EQUAL_DOUBLE(mset[4].get_weight(), 0.7210119356168847);
 306 }
 307
 308
 309 // Test for invalid values of c.
 310 DEFINE_TESTCASE(inl2weight2, !backend) {
 311     // InvalidArgumentError should be thrown if the parameter c is invalid.
 312     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 313         Xapian::InL2Weight wt(-2.0));
 314
 315     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 316         Xapian::InL2Weight wt2(0.0));
 317 }
 318
 319 // Feature tests for Inl2Weight
 320 DEFINE_TESTCASE(inl2weight3, backend) {
 321     Xapian::Database db = get_database("apitest_simpledata");
 322     Xapian::Enquire enquire(db);
 323     Xapian::Query query("banana");
 324
 325     enquire.set_query(query);
 326     enquire.set_weighting_scheme(Xapian::InL2Weight(2.0));
 327
 328     Xapian::MSet mset1;
 329     mset1 = enquire.get_mset(0, 10);
 330     TEST_EQUAL(mset1.size(), 1);
 331     mset_expect_order(mset1, 6);
 332
 333     /* The value has been calculated in the python interpreter by looking at the
 334      * database statistics. */
 335     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.559711143842063);
 336 }
 337
 338 // Test for invalid values of c.
 339 DEFINE_TESTCASE(ifb2weight2, !backend) {
 340     // InvalidArgumentError should be thrown if the parameter c is invalid.
 341     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 342         Xapian::IfB2Weight wt(-2.0));
 343
 344     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 345         Xapian::IfB2Weight wt2(0.0));
 346 }
 347
 348 // Feature test
 349 DEFINE_TESTCASE(ifb2weight3, backend) {
 350     Xapian::Database db = get_database("apitest_simpledata");
 351     Xapian::Enquire enquire(db);
 352     Xapian::Query query("banana");
 353
 354     enquire.set_query(query);
 355     enquire.set_weighting_scheme(Xapian::IfB2Weight(2.0));
 356
 357     Xapian::MSet mset1;
 358     mset1 = enquire.get_mset(0, 10);
 359     TEST_EQUAL(mset1.size(), 1);
 360
 361     /* The value of the weight has been manually calculated using the statistics
 362      * of the test database. */
 363     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.119422287684126);
 364 }
 365
 366 // Test for invalid values of c.
 367 DEFINE_TESTCASE(ineb2weight2, !backend) {
 368     // InvalidArgumentError should be thrown if parameter c is invalid.
 369     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 370         Xapian::IneB2Weight wt(-2.0));
 371
 372     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 373         Xapian::IneB2Weight wt2(0.0));
 374 }
 375
 376 // Feature test.
 377 DEFINE_TESTCASE(ineb2weight3, backend) {
 378     Xapian::Database db = get_database("apitest_simpledata");
 379     Xapian::Enquire enquire(db);
 380     Xapian::Query query("paragraph");
 381     enquire.set_query(query);
 382     enquire.set_weighting_scheme(Xapian::IneB2Weight(2.0));
 383
 384     Xapian::MSet mset1;
 385     mset1 = enquire.get_mset(0, 10);
 386     TEST_EQUAL(mset1.size(), 5);
 387
 388     // The third document in the database is 4th in the ranking.
 389     /* The weight value has been manually calculated by using the statistics
 390      * of the test database. */
 391     TEST_EQUAL_DOUBLE(mset1[4].get_weight(), 0.61709730297692400036);
 392 }
 393
 394 // Test for invalid values of c.
 395 DEFINE_TESTCASE(bb2weight2, !backend) {
 396     // InvalidArgumentError should be thrown if the parameter c is invalid.
 397     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 398         Xapian::BB2Weight wt(-2.0));
 399
 400     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 401         Xapian::BB2Weight wt2(0.0));
 402 }
 403
 404 // Feature test
 405 DEFINE_TESTCASE(bb2weight3, backend) {
 406     Xapian::Database db = get_database("apitest_simpledata");
 407     Xapian::Enquire enquire(db);
 408     Xapian::Query query("paragraph");
 409
 410     enquire.set_query(query);
 411     enquire.set_weighting_scheme(Xapian::BB2Weight(2.0));
 412
 413     Xapian::MSet mset1;
 414     mset1 = enquire.get_mset(0, 10);
 415     TEST_EQUAL(mset1.size(), 5);
 416     /* The third document in the database has the highest weight and is the
 417      * first in the mset. */
 418     // Value calculated manually by using the statistics of the test database.
 419     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.6823696969784483);
 420
 421     // Test with OP_SCALE_WEIGHT and a small factor (regression test, as we
 422     // were applying the factor to the upper bound twice).
 423     enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 1.0 / 1024));
 424     enquire.set_weighting_scheme(Xapian::BB2Weight(2.0));
 425
 426     Xapian::MSet mset3;
 427     mset3 = enquire.get_mset(0, 10);
 428     TEST_EQUAL(mset3.size(), 5);
 429
 430     for (int i = 0; i < 5; ++i) {
 431         TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight() * 1024);
 432     }
 433 }
 434
 435 // Regression test: we used to calculate log2(0) when there was only one doc.
 436 DEFINE_TESTCASE(bb2weight4, backend) {
 437     Xapian::Database db = get_database("apitest_onedoc");
 438     Xapian::Enquire enquire(db);
 439     Xapian::Query query("word");
 440
 441     enquire.set_query(query);
 442     enquire.set_weighting_scheme(Xapian::BB2Weight());
 443
 444     Xapian::MSet mset1;
 445     mset1 = enquire.get_mset(0, 10);
 446     TEST_EQUAL(mset1.size(), 1);
 447     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 3.431020621347435);
 448 }
 449
 450 // Feature test.
 451 DEFINE_TESTCASE(dlhweight1, backend) {
 452     Xapian::Database db = get_database("apitest_simpledata");
 453     Xapian::Enquire enquire(db);
 454     Xapian::Query query("a");
 455
 456     enquire.set_query(query);
 457     enquire.set_weighting_scheme(Xapian::DLHWeight());
 458
 459     Xapian::MSet mset1;
 460     mset1 = enquire.get_mset(0, 10);
 461     TEST_EQUAL(mset1.size(), 3);
 462     mset_expect_order(mset1, 3, 1, 2);
 463     // Weights calculated manually using stats from the database.
 464     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 1.0046477754371292362);
 465     TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.97621929514640352757);
 466     // The following weight would be negative but gets clamped to 0.
 467     TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.0);
 468 }
 469
 470 static void
 471 gen_wdf_eq_doclen_db(Xapian::WritableDatabase& db, const string&)
 472 {
 473     Xapian::Document doc;
 474     doc.add_term("solo", 37);
 475     db.add_document(doc);
 476 }
 477
 478 // Test wdf == doclen.
 479 DEFINE_TESTCASE(dlhweight3, backend) {
 480     Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
 481     Xapian::Enquire enquire(db);
 482     Xapian::Query query("solo");
 483
 484     enquire.set_query(query);
 485     enquire.set_weighting_scheme(Xapian::DLHWeight());
 486
 487     Xapian::MSet mset1;
 488     mset1 = enquire.get_mset(0, 10);
 489     TEST_EQUAL(mset1.size(), 1);
 490     // Weight gets clamped to zero.
 491     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
 492 }
 493
 494 // Test for invalid values of c.
 495 DEFINE_TESTCASE(pl2weight2, !backend) {
 496     // InvalidArgumentError should be thrown if parameter c is invalid.
 497     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 498         Xapian::PL2Weight wt(-2.0));
 499 }
 500
 501 // Feature Test.
 502 DEFINE_TESTCASE(pl2weight3, backend) {
 503     Xapian::Database db = get_database("apitest_simpledata");
 504     Xapian::Enquire enquire(db);
 505     Xapian::Query query("paragraph");
 506     enquire.set_query(query);
 507     Xapian::MSet mset;
 508
 509     enquire.set_weighting_scheme(Xapian::PL2Weight(2.0));
 510     mset = enquire.get_mset(0, 10);
 511     TEST_EQUAL(mset.size(), 5);
 512     // Expected weight difference calculated in extended precision using stats
 513     // from the test database.
 514     TEST_EQUAL_DOUBLE(mset[2].get_weight(),
 515                       mset[3].get_weight() + 0.0086861771701328694);
 516 }
 517
 518 // Test for invalid values of parameters, c and delta.
 519 DEFINE_TESTCASE(pl2plusweight2, !backend) {
 520     // InvalidArgumentError should be thrown if parameter c is invalid.
 521     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 522         Xapian::PL2PlusWeight wt(-2.0, 0.9));
 523
 524     // InvalidArgumentError should be thrown if parameter delta is invalid.
 525     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 526         Xapian::PL2PlusWeight wt(1.0, -1.9));
 527 }
 528
 529 // Feature Test 1 for PL2PlusWeight.
 530 DEFINE_TESTCASE(pl2plusweight4, backend) {
 531     Xapian::Database db = get_database("apitest_simpledata");
 532     Xapian::Enquire enquire(db);
 533     enquire.set_query(Xapian::Query("to"));
 534     Xapian::MSet mset;
 535
 536     enquire.set_weighting_scheme(Xapian::PL2PlusWeight(2.0, 0.8));
 537     mset = enquire.get_mset(0, 10);
 538     TEST_EQUAL(mset.size(), 3);
 539     // Expected weight difference calculated in Python using stats from the
 540     // test database.
 541     TEST_EQUAL_DOUBLE(mset[1].get_weight(),
 542                       mset[2].get_weight() + 0.016760925252262027);
 543 }
 544
 545 // Feature Test 2 for PL2PlusWeight
 546 DEFINE_TESTCASE(pl2plusweight5, backend) {
 547     Xapian::Database db = get_database("apitest_simpledata");
 548     Xapian::Enquire enquire(db);
 549     Xapian::Query query("word");
 550     enquire.set_query(query);
 551     Xapian::MSet mset;
 552
 553     enquire.set_weighting_scheme(Xapian::PL2PlusWeight(1.0, 0.8));
 554     mset = enquire.get_mset(0, 10);
 555     // Expect MSet contains two documents having query "word".
 556     TEST_EQUAL(mset.size(), 2);
 557     // Expect Document 2 has higher weight than document 4 because
 558     // "word" appears more no. of times in document 2 than document 4.
 559     mset_expect_order(mset, 2, 4);
 560 }
 561
 562 // Feature test
 563 DEFINE_TESTCASE(dphweight1, backend) {
 564     Xapian::Database db = get_database("apitest_simpledata");
 565     Xapian::Enquire enquire(db);
 566     Xapian::Query query("paragraph");
 567
 568     enquire.set_query(query);
 569     enquire.set_weighting_scheme(Xapian::DPHWeight());
 570
 571     Xapian::MSet mset1;
 572     mset1 = enquire.get_mset(0, 10);
 573     TEST_EQUAL(mset1.size(), 5);
 574     /* The weight has been calculated manually by using the statistics of the
 575      * test database. */
 576     TEST_EQUAL_DOUBLE(mset1[2].get_weight() - mset1[4].get_weight(), 0.542623617687990167);
 577 }
 578
 579 // Test wdf == doclen.
 580 DEFINE_TESTCASE(dphweight3, backend) {
 581     Xapian::Database db = get_database("wdf_eq_doclen", gen_wdf_eq_doclen_db);
 582     Xapian::Enquire enquire(db);
 583     Xapian::Query query("solo");
 584
 585     enquire.set_query(query);
 586     enquire.set_weighting_scheme(Xapian::DPHWeight());
 587
 588     Xapian::MSet mset1;
 589     mset1 = enquire.get_mset(0, 10);
 590     TEST_EQUAL(mset1.size(), 1);
 591     // Weight gets clamped to zero.
 592     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
 593 }
 594
 595 // Test for various cases of normalization string.
 596 DEFINE_TESTCASE(tfidfweight1, !backend) {
 597     // InvalidArgumentError should be thrown if normalization string is invalid
 598     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 599         Xapian::TfIdfWeight b("JOHN_LENNON"));
 600
 601     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 602         Xapian::TfIdfWeight b("LOL"));
 603
 604     /* Normalization string should be set to "ntn" by constructor if none is
 605       given. */
 606     Xapian::TfIdfWeight weight2;
 607     TEST_EQUAL(weight2.serialise(), Xapian::TfIdfWeight("ntn").serialise());
 608
 609     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 610         Xapian::Weight::create("tfidf FUN NONE NONE"));
 611
 612     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 613         Xapian::Weight::create("tfidf NONE FUN NONE"));
 614
 615     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 616         Xapian::Weight::create("tfidf NONE NONE FUN"));
 617
 618     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 619         Xapian::Weight::create("tfidf NONE"));
 620
 621     TEST_EXCEPTION(Xapian::InvalidArgumentError,
 622         Xapian::Weight::create("tfidf NONE NONE"));
 623 }
 624
 625 // Feature tests for various normalization functions.
 626 DEFINE_TESTCASE(tfidfweight3, backend) {
 627     Xapian::Database db = get_database("apitest_simpledata");
 628     Xapian::Enquire enquire(db);
 629     Xapian::Query query("word");
 630     Xapian::MSet mset;
 631
 632     // Check for "ntn" when termfreq != N
 633     enquire.set_query(query);
 634     enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
 635     mset = enquire.get_mset(0, 10);
 636     TEST_EQUAL(mset.size(), 2);
 637     // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
 638     mset_expect_order(mset, 2, 4);
 639     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
 640
 641     // Check that wqf is taken into account.
 642     enquire.set_query(Xapian::Query("word", 2));
 643     enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
 644     Xapian::MSet mset2 = enquire.get_mset(0, 10);
 645     TEST_EQUAL(mset2.size(), 2);
 646     // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
 647     mset_expect_order(mset2, 2, 4);
 648     // wqf is 2, so weights should be doubled.
 649     TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
 650     TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
 651
 652     // check for "nfn" when termfreq != N
 653     enquire.set_query(query);
 654     enquire.set_weighting_scheme(Xapian::TfIdfWeight("nfn"));
 655     mset = enquire.get_mset(0, 10);
 656     TEST_EQUAL(mset.size(), 2);
 657     mset_expect_order(mset, 2, 4);
 658     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
 659
 660     // check for "nsn" when termfreq != N
 661     enquire.set_query(query);
 662     enquire.set_weighting_scheme(Xapian::TfIdfWeight("nsn"));
 663     mset = enquire.get_mset(0, 10);
 664     TEST_EQUAL(mset.size(), 2);
 665     mset_expect_order(mset, 2, 4);
 666     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
 667
 668     // Check for "bnn" and for both branches of 'b'.
 669     enquire.set_query(Xapian::Query("test"));
 670     enquire.set_weighting_scheme(Xapian::TfIdfWeight("bnn"));
 671     mset = enquire.get_mset(0, 10);
 672     TEST_EQUAL(mset.size(), 1);
 673     mset_expect_order(mset, 1);
 674     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
 675
 676     // Check for "lnn" and for both branches of 'l'.
 677     enquire.set_query(Xapian::Query("word"));
 678     enquire.set_weighting_scheme(Xapian::TfIdfWeight("lnn"));
 679     mset = enquire.get_mset(0, 10);
 680     TEST_EQUAL(mset.size(), 2);
 681     mset_expect_order(mset, 2, 4);
 682     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0)); // idfn=1 and so wt=tfn=1+log(tf)
 683     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0);         // idfn=1 and wt=tfn=1+log(tf)=1+log(1)=1
 684
 685     // Check for "snn"
 686     enquire.set_query(Xapian::Query("paragraph"));
 687     enquire.set_weighting_scheme(Xapian::TfIdfWeight("snn")); // idf=1 and tfn=tf*tf
 688     mset = enquire.get_mset(0, 10);
 689     TEST_EQUAL(mset.size(), 5);
 690     mset_expect_order(mset, 2, 1, 4, 3, 5);
 691     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
 692     TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
 693
 694     // Check for "ntn" when termfreq=N
 695     enquire.set_query(Xapian::Query("this"));  // N=termfreq and so idfn=0 for "t"
 696     enquire.set_weighting_scheme(Xapian::TfIdfWeight("ntn"));
 697     mset = enquire.get_mset(0, 10);
 698     TEST_EQUAL(mset.size(), 6);
 699     mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
 700     for (int i = 0; i < 6; ++i) {
 701         TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
 702     }
 703
 704     // Check for "npn" and for both branches of 'p'
 705     enquire.set_query(Xapian::Query("this"));  // N=termfreq and so idfn=0 for "p"
 706     enquire.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
 707     mset = enquire.get_mset(0, 10);
 708     TEST_EQUAL(mset.size(), 6);
 709     mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
 710     for (int i = 0; i < 6; ++i) {
 711         TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
 712     }
 713
 714     // Check for "Lnn".
 715     enquire.set_query(Xapian::Query("word"));
 716     enquire.set_weighting_scheme(Xapian::TfIdfWeight("Lnn"));
 717     mset = enquire.get_mset(0, 10);
 718     TEST_EQUAL(mset.size(), 2);
 719     mset_expect_order(mset, 2, 4);
 720     TEST_EQUAL_DOUBLE(mset[0].get_weight(), (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
 721     TEST_EQUAL_DOUBLE(mset[1].get_weight(), (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
 722
 723     enquire.set_query(Xapian::Query("word"));
 724     enquire.set_weighting_scheme(Xapian::TfIdfWeight("npn"));
 725     mset = enquire.get_mset(0, 10);
 726     TEST_EQUAL(mset.size(), 2);
 727     mset_expect_order(mset, 2, 4);
 728     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
 729     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
 730
 731     // Check for "mnn".
 732     enquire.set_query(Xapian::Query("word"));
 733     enquire.set_weighting_scheme(Xapian::TfIdfWeight("mnn"));
 734     mset = enquire.get_mset(0, 10);
 735     TEST_EQUAL(mset.size(), 2);
 736     mset_expect_order(mset, 2, 4);
 737     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 8);
 738     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0 / 4);
 739
 740     // Check for "ann".
 741     enquire.set_query(Xapian::Query("word"));
 742     enquire.set_weighting_scheme(Xapian::TfIdfWeight("ann"));
 743     mset = enquire.get_mset(0, 10);
 744     TEST_EQUAL(mset.size(), 2);
 745     mset_expect_order(mset, 2, 4);
 746     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
 747     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
 748
 749     // Check for NONE, TFIDF, NONE when termfreq != N
 750     enquire.set_query(query);
 751     enquire.set_weighting_scheme(
 752         Xapian::TfIdfWeight(
 753             Xapian::TfIdfWeight::wdf_norm::NONE,
 754             Xapian::TfIdfWeight::idf_norm::TFIDF,
 755             Xapian::TfIdfWeight::wt_norm::NONE));
 756     mset = enquire.get_mset(0, 10);
 757     TEST_EQUAL(mset.size(), 2);
 758     // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
 759     mset_expect_order(mset, 2, 4);
 760     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * log(6.0 / 2));
 761
 762     // Check that wqf is taken into account.
 763     enquire.set_query(Xapian::Query("word", 2));
 764     mset2 = enquire.get_mset(0, 10);
 765     TEST_EQUAL(mset2.size(), 2);
 766     // doc 2 should have higher weight than 4 as only tf(wdf) will dominate.
 767     mset_expect_order(mset2, 2, 4);
 768     // wqf is 2, so weights should be doubled.
 769     TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
 770     TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
 771
 772     // check for NONE, FREQ, NONE when termfreq != N
 773     enquire.set_query(query);
 774     enquire.set_weighting_scheme(
 775         Xapian::TfIdfWeight(
 776             Xapian::TfIdfWeight::wdf_norm::NONE,
 777             Xapian::TfIdfWeight::idf_norm::FREQ,
 778             Xapian::TfIdfWeight::wt_norm::NONE));
 779     mset = enquire.get_mset(0, 10);
 780     TEST_EQUAL(mset.size(), 2);
 781     mset_expect_order(mset, 2, 4);
 782     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 2);
 783
 784     // check for NONE, SQUARE, NONE when termfreq != N
 785     enquire.set_query(query);
 786     enquire.set_weighting_scheme(
 787         Xapian::TfIdfWeight(
 788             Xapian::TfIdfWeight::wdf_norm::NONE,
 789             Xapian::TfIdfWeight::idf_norm::SQUARE,
 790             Xapian::TfIdfWeight::wt_norm::NONE));
 791     mset = enquire.get_mset(0, 10);
 792     TEST_EQUAL(mset.size(), 2);
 793     mset_expect_order(mset, 2, 4);
 794     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 * pow(log(6.0 / 2), 2.0));
 795
 796     // Check for BOOLEAN, NONE, NONE and for both branches of BOOLEAN.
 797     enquire.set_query(Xapian::Query("test"));
 798     enquire.set_weighting_scheme(
 799         Xapian::TfIdfWeight(
 800             Xapian::TfIdfWeight::wdf_norm::BOOLEAN,
 801             Xapian::TfIdfWeight::idf_norm::NONE,
 802             Xapian::TfIdfWeight::wt_norm::NONE));
 803     mset = enquire.get_mset(0, 10);
 804     TEST_EQUAL(mset.size(), 1);
 805     mset_expect_order(mset, 1);
 806     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1.0);
 807
 808     // Check for LOG, NONE, NONE and for both branches of LOG.
 809     enquire.set_query(Xapian::Query("word"));
 810     enquire.set_weighting_scheme(
 811         Xapian::TfIdfWeight(
 812             Xapian::TfIdfWeight::wdf_norm::LOG,
 813             Xapian::TfIdfWeight::idf_norm::NONE,
 814             Xapian::TfIdfWeight::wt_norm::NONE));
 815     mset = enquire.get_mset(0, 10);
 816     TEST_EQUAL(mset.size(), 2);
 817     mset_expect_order(mset, 2, 4);
 818     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 1 + log(8.0));
 819     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0);
 820
 821     // Check for SQUARE, NONE, NONE.
 822     enquire.set_query(Xapian::Query("paragraph"));
 823     enquire.set_weighting_scheme(
 824         Xapian::TfIdfWeight(
 825             Xapian::TfIdfWeight::wdf_norm::SQUARE,
 826             Xapian::TfIdfWeight::idf_norm::NONE,
 827             Xapian::TfIdfWeight::wt_norm::NONE)); // idf=1 and tfn=tf*tf
 828     mset = enquire.get_mset(0, 10);
 829     TEST_EQUAL(mset.size(), 5);
 830     mset_expect_order(mset, 2, 1, 4, 3, 5);
 831     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 9.0);
 832     TEST_EQUAL_DOUBLE(mset[4].get_weight(), 1.0);
 833
 834     // Check for NONE, TFIDF, NONE when termfreq=N
 835     enquire.set_query(Xapian::Query("this"));
 836     // N=termfreq and so idfn=0 for TFIDF
 837     enquire.set_weighting_scheme(
 838         Xapian::TfIdfWeight(
 839             Xapian::TfIdfWeight::wdf_norm::NONE,
 840             Xapian::TfIdfWeight::idf_norm::TFIDF,
 841             Xapian::TfIdfWeight::wt_norm::NONE));
 842     mset = enquire.get_mset(0, 10);
 843     TEST_EQUAL(mset.size(), 6);
 844     mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
 845     for (int i = 0; i < 6; ++i) {
 846         TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
 847     }
 848
 849     // Check for NONE, PROB, NONE and for both branches of PROB
 850     enquire.set_query(Xapian::Query("this"));
 851     // N=termfreq and so idfn=0 for PROB
 852     enquire.set_weighting_scheme(
 853         Xapian::TfIdfWeight(
 854             Xapian::TfIdfWeight::wdf_norm::NONE,
 855             Xapian::TfIdfWeight::idf_norm::PROB,
 856             Xapian::TfIdfWeight::wt_norm::NONE));
 857     mset = enquire.get_mset(0, 10);
 858     TEST_EQUAL(mset.size(), 6);
 859     mset_expect_order(mset, 1, 2, 3, 4, 5, 6);
 860     for (int i = 0; i < 6; ++i) {
 861         TEST_EQUAL_DOUBLE(mset[i].get_weight(), 0.0);
 862     }
 863
 864     enquire.set_query(Xapian::Query("word"));
 865     enquire.set_weighting_scheme(
 866         Xapian::TfIdfWeight(
 867             Xapian::TfIdfWeight::wdf_norm::NONE,
 868             Xapian::TfIdfWeight::idf_norm::PROB,
 869             Xapian::TfIdfWeight::wt_norm::NONE));
 870     mset = enquire.get_mset(0, 10);
 871     TEST_EQUAL(mset.size(), 2);
 872     mset_expect_order(mset, 2, 4);
 873     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log((6.0 - 2) / 2));
 874     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log((6.0 - 2) / 2));
 875
 876     // Check for LOG_AVERAGE, NONE, NONE.
 877     enquire.set_query(Xapian::Query("word"));
 878     enquire.set_weighting_scheme(
 879         Xapian::TfIdfWeight(
 880             Xapian::TfIdfWeight::wdf_norm::LOG_AVERAGE,
 881             Xapian::TfIdfWeight::idf_norm::NONE,
 882             Xapian::TfIdfWeight::wt_norm::NONE));
 883     mset = enquire.get_mset(0, 10);
 884     TEST_EQUAL(mset.size(), 2);
 885     mset_expect_order(mset, 2, 4);
 886     TEST_EQUAL_DOUBLE(mset[0].get_weight(),
 887                       (1 + log(8.0)) / (1 + log(81.0 / 56.0)));
 888     TEST_EQUAL_DOUBLE(mset[1].get_weight(),
 889                       (1 + log(1.0)) / (1 + log(31.0 / 26.0)));
 890
 891     // Check for AUG_LOG, NONE, NONE.
 892     enquire.set_weighting_scheme(
 893         Xapian::TfIdfWeight(
 894             Xapian::TfIdfWeight::wdf_norm::AUG_LOG,
 895             Xapian::TfIdfWeight::idf_norm::NONE,
 896             Xapian::TfIdfWeight::wt_norm::NONE));
 897     mset = enquire.get_mset(0, 10);
 898     TEST_EQUAL(mset.size(), 2);
 899     mset_expect_order(mset, 2, 4);
 900     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.2 + 0.8 * log(1.0 + 8));
 901     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.2 + 0.8 * log(1.0 + 1));
 902
 903     // Check for NONE, GLOBAL_FREQ, NONE.
 904     enquire.set_weighting_scheme(
 905         Xapian::TfIdfWeight(
 906             Xapian::TfIdfWeight::wdf_norm::NONE,
 907             Xapian::TfIdfWeight::idf_norm::GLOBAL_FREQ,
 908             Xapian::TfIdfWeight::wt_norm::NONE));
 909     mset = enquire.get_mset(0, 10);
 910     TEST_EQUAL(mset.size(), 2);
 911     mset_expect_order(mset, 2, 4);
 912     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * (9.0 / 2));
 913     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * (9.0 / 2));
 914
 915     // Check for SQRT, NONE, NONE.
 916     enquire.set_weighting_scheme(
 917         Xapian::TfIdfWeight(
 918             Xapian::TfIdfWeight::wdf_norm::SQRT,
 919             Xapian::TfIdfWeight::idf_norm::NONE,
 920             Xapian::TfIdfWeight::wt_norm::NONE));
 921     mset = enquire.get_mset(0, 10);
 922     TEST_EQUAL(mset.size(), 2);
 923     mset_expect_order(mset, 2, 4);
 924     TEST_EQUAL_DOUBLE(mset[0].get_weight(), sqrt(8 - 0.5) + 1);
 925     TEST_EQUAL_DOUBLE(mset[1].get_weight(), sqrt(1 - 0.5) + 1);
 926
 927     // Check for NONE, LOG_GLOBAL_FREQ, NONE.
 928     enquire.set_weighting_scheme(
 929         Xapian::TfIdfWeight(
 930             Xapian::TfIdfWeight::wdf_norm::NONE,
 931             Xapian::TfIdfWeight::idf_norm::LOG_GLOBAL_FREQ,
 932             Xapian::TfIdfWeight::wt_norm::NONE));
 933     mset = enquire.get_mset(0, 10);
 934     TEST_EQUAL(mset.size(), 2);
 935     mset_expect_order(mset, 2, 4);
 936     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * log(9.0 / 2 + 1));
 937     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * log(9.0 / 2 + 1));
 938
 939     // Check for NONE, INCREMENTED_GLOBAL_FREQ, NONE.
 940     enquire.set_weighting_scheme(
 941         Xapian::TfIdfWeight(
 942             Xapian::TfIdfWeight::wdf_norm::NONE,
 943             Xapian::TfIdfWeight::idf_norm::INCREMENTED_GLOBAL_FREQ,
 944             Xapian::TfIdfWeight::wt_norm::NONE));
 945     mset = enquire.get_mset(0, 10);
 946     TEST_EQUAL(mset.size(), 2);
 947     mset_expect_order(mset, 2, 4);
 948     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * (9.0 / 2 + 1));
 949     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * (9.0 / 2 + 1));
 950
 951     // Check for NONE, SQRT_GLOBAL_FREQ, NONE.
 952     enquire.set_weighting_scheme(
 953         Xapian::TfIdfWeight(
 954             Xapian::TfIdfWeight::wdf_norm::NONE,
 955             Xapian::TfIdfWeight::idf_norm::SQRT_GLOBAL_FREQ,
 956             Xapian::TfIdfWeight::wt_norm::NONE));
 957     mset = enquire.get_mset(0, 10);
 958     TEST_EQUAL(mset.size(), 2);
 959     mset_expect_order(mset, 2, 4);
 960     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8 * sqrt(9.0 / 2 - 0.9));
 961     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1 * sqrt(9.0 / 2 - 0.9));
 962
 963     // Check for AUG_AVERAGE, NONE, NONE.
 964     enquire.set_weighting_scheme(
 965         Xapian::TfIdfWeight(
 966             Xapian::TfIdfWeight::wdf_norm::AUG_AVERAGE,
 967             Xapian::TfIdfWeight::idf_norm::NONE,
 968             Xapian::TfIdfWeight::wt_norm::NONE));
 969     mset = enquire.get_mset(0, 10);
 970     TEST_EQUAL(mset.size(), 2);
 971     mset_expect_order(mset, 2, 4);
 972     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.9 + 0.1 * (8.0 / (81.0 / 56.0)));
 973     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.9 + 0.1 * (1.0 / (31.0 / 26.0)));
 974
 975     // Check for MAX, NONE, NONE.
 976     enquire.set_weighting_scheme(
 977         Xapian::TfIdfWeight(
 978             Xapian::TfIdfWeight::wdf_norm::MAX,
 979             Xapian::TfIdfWeight::idf_norm::NONE,
 980             Xapian::TfIdfWeight::wt_norm::NONE));
 981     mset = enquire.get_mset(0, 10);
 982     TEST_EQUAL(mset.size(), 2);
 983     mset_expect_order(mset, 2, 4);
 984     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 8.0 / 8);
 985     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 1.0 / 4);
 986
 987     // Check for AUG, NONE, NONE.
 988     enquire.set_weighting_scheme(
 989         Xapian::TfIdfWeight(
 990             Xapian::TfIdfWeight::wdf_norm::AUG,
 991             Xapian::TfIdfWeight::idf_norm::NONE,
 992             Xapian::TfIdfWeight::wt_norm::NONE));
 993     mset = enquire.get_mset(0, 10);
 994     TEST_EQUAL(mset.size(), 2);
 995     mset_expect_order(mset, 2, 4);
 996     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 0.5 + 0.5 * 8.0 / 8);
 997     TEST_EQUAL_DOUBLE(mset[1].get_weight(), 0.5 + 0.5 * 1.0 / 4);
 998 }
 999
1000 // Feature tests for pivoted normalization functions.
1001 DEFINE_TESTCASE(tfidfweight4, backend) {
1002     Xapian::Database db = get_database("apitest_simpledata");
1003     Xapian::Enquire enquire(db);
1004     Xapian::Query query("paragraph");
1005     Xapian::MSet mset;
1006
1007     // Check for "PPn" normalization string.
1008     enquire.set_query(query);
1009     enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1010     mset = enquire.get_mset(0, 10);
1011     TEST_EQUAL(mset.size(), 5);
1012     // Shorter docs should ranker higher if wqf is equal among all the docs.
1013     TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
1014     TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
1015
1016     // Check that wqf is taken into account.
1017     enquire.set_query(Xapian::Query("paragraph", 2));
1018     enquire.set_weighting_scheme(Xapian::TfIdfWeight("PPn", 0.2, 1.0));
1019     Xapian::MSet mset2 = enquire.get_mset(0, 10);
1020     TEST_EQUAL(mset2.size(), 5);
1021     // wqf is 2, so weights should be doubled.
1022     TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
1023     TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
1024
1025     // check for "nPn" which represents "xPx"
1026     enquire.set_query(Xapian::Query("word"));
1027     enquire.set_weighting_scheme(Xapian::TfIdfWeight("nPn", 0.2, 1.0));
1028     mset = enquire.get_mset(0, 10);
1029     TEST_EQUAL(mset.size(), 2);
1030     // Expect doc 2 with query "word" to have higher weight than doc 4.
1031     mset_expect_order(mset, 2, 4);
1032
1033     // check for "Ptn" which represents "Pxx"
1034     enquire.set_query(Xapian::Query("word"));
1035     enquire.set_weighting_scheme(Xapian::TfIdfWeight("Ptn", 0.2, 1.0));
1036     mset = enquire.get_mset(0, 10);
1037     TEST_EQUAL(mset.size(), 2);
1038     // Expect doc 2 with query "word" to have higher weight than doc 4.
1039     mset_expect_order(mset, 2, 4);
1040
1041     // Check for PIVOTED, PIVOTED, NONE normalization string.
1042     enquire.set_query(query);
1043     enquire.set_weighting_scheme(
1044         Xapian::TfIdfWeight(
1045             Xapian::TfIdfWeight::wdf_norm::PIVOTED,
1046             Xapian::TfIdfWeight::idf_norm::PIVOTED,
1047             Xapian::TfIdfWeight::wt_norm::NONE));
1048     mset = enquire.get_mset(0, 10);
1049     TEST_EQUAL(mset.size(), 5);
1050     // Shorter docs should ranker higher if wqf is equal among all the docs.
1051     TEST_REL(mset[0].get_weight(),>,mset[1].get_weight());
1052     TEST_REL(mset[2].get_weight(),>,mset[3].get_weight());
1053
1054     // Check that wqf is taken into account.
1055     enquire.set_query(Xapian::Query("paragraph", 2));
1056     mset2 = enquire.get_mset(0, 10);
1057     TEST_EQUAL(mset2.size(), 5);
1058     // wqf is 2, so weights should be doubled.
1059     TEST_EQUAL_DOUBLE(mset[0].get_weight() * 2, mset2[0].get_weight());
1060     TEST_EQUAL_DOUBLE(mset[1].get_weight() * 2, mset2[1].get_weight());
1061
1062     // check for NONE, PIVOTED, NONE
1063     enquire.set_query(Xapian::Query("word"));
1064     enquire.set_weighting_scheme(
1065         Xapian::TfIdfWeight(
1066             Xapian::TfIdfWeight::wdf_norm::NONE,
1067             Xapian::TfIdfWeight::idf_norm::PIVOTED,
1068             Xapian::TfIdfWeight::wt_norm::NONE));
1069     mset = enquire.get_mset(0, 10);
1070     TEST_EQUAL(mset.size(), 2);
1071     // Expect doc 2 with query "word" to have higher weight than doc 4.
1072     mset_expect_order(mset, 2, 4);
1073
1074     // check for PIVOTED, TFIDF, NONE
1075     enquire.set_query(Xapian::Query("word"));
1076     enquire.set_weighting_scheme(
1077         Xapian::TfIdfWeight(
1078             Xapian::TfIdfWeight::wdf_norm::PIVOTED,
1079             Xapian::TfIdfWeight::idf_norm::TFIDF,
1080             Xapian::TfIdfWeight::wt_norm::NONE));
1081     mset = enquire.get_mset(0, 10);
1082     TEST_EQUAL(mset.size(), 2);
1083     // Expect doc 2 with query "word" to have higher weight than doc 4.
1084     mset_expect_order(mset, 2, 4);
1085 }
1086
1087 // Check that create_from_parameters() creates the correct object.
1088 DEFINE_TESTCASE(tfidfweight5, !backend) {
1089     auto wt_ptr = Xapian::Weight::create("tfidf NONE TFIDF NONE");
1090     auto wt = Xapian::TfIdfWeight(Xapian::TfIdfWeight::wdf_norm::NONE,
1091                                   Xapian::TfIdfWeight::idf_norm::TFIDF,
1092                                   Xapian::TfIdfWeight::wt_norm::NONE);
1093     TEST_EQUAL(wt_ptr->serialise(), wt.serialise());
1094     delete wt_ptr;
1095
1096     auto wt_ptr2 = Xapian::Weight::create("tfidf SQRT PIVOTED NONE");
1097     auto wt2 = Xapian::TfIdfWeight(Xapian::TfIdfWeight::wdf_norm::SQRT,
1098                                    Xapian::TfIdfWeight::idf_norm::PIVOTED,
1099                                    Xapian::TfIdfWeight::wt_norm::NONE);
1100     TEST_EQUAL(wt_ptr2->serialise(), wt2.serialise());
1101     delete wt_ptr2;
1102 }
1103
1104 class CheckInitWeight : public Xapian::Weight {
1105   public:
1106     double factor;
1107
1108     unsigned & zero_inits, & non_zero_inits;
1109
1110     CheckInitWeight(unsigned &z, unsigned &n)
1111         : factor(-1.0), zero_inits(z), non_zero_inits(n) {
1112         need_stat(DOC_LENGTH);
1113     }
1114
1115     void init(double factor_) override {
1116         factor = factor_;
1117         if (factor == 0.0)
1118             ++zero_inits;
1119         else
1120             ++non_zero_inits;
1121     }
1122
1123     Weight* clone() const override {
1124         return new CheckInitWeight(zero_inits, non_zero_inits);
1125     }
1126
1127     double get_sumpart(Xapian::termcount, Xapian::termcount,
1128                        Xapian::termcount, Xapian::termcount) const override {
1129         return 1.0;
1130     }
1131
1132     double get_maxpart() const override { return 1.0; }
1133
1134     double get_sumextra(Xapian::termcount doclen,
1135                         Xapian::termcount,
1136                         Xapian::termcount) const override {
1137         return 1.0 / doclen;
1138     }
1139
1140     double get_maxextra() const override { return 1.0; }
1141 };
1142
1143 /// Regression test - check init() is called for the term-indep Weight obj.
1144 DEFINE_TESTCASE(checkinitweight1, backend && !multi && !remote) {
1145     Xapian::Database db = get_database("apitest_simpledata");
1146     Xapian::Enquire enquire(db);
1147     Xapian::Query q(Xapian::Query::OP_AND,
1148                     Xapian::Query("this"), Xapian::Query("paragraph"));
1149     enquire.set_query(q);
1150     unsigned zero_inits = 0, non_zero_inits = 0;
1151     CheckInitWeight wt(zero_inits, non_zero_inits);
1152     enquire.set_weighting_scheme(wt);
1153     Xapian::MSet mset = enquire.get_mset(0, 3);
1154     TEST_EQUAL(zero_inits, 1);
1155     TEST_EQUAL(non_zero_inits, 2);
1156 }
1157
1158 class CheckStatsWeight : public Xapian::Weight {
1159   public:
1160     double factor = -1.0;
1161
1162     Xapian::Database db;
1163
1164     string term1;
1165
1166     // When testing OP_SYNONYM, term2 is also set.
1167     // When testing OP_WILDCARD, term2 == "*".
1168     // When testing a repeated term, term2 == "=" for the first occurrence and
1169     // "_" for subsequent occurrences.
1170     mutable string term2;
1171
1172     Xapian::termcount & sum;
1173     Xapian::termcount & sum_squares;
1174
1175     mutable Xapian::termcount len_upper = 0;
1176     mutable Xapian::termcount len_lower = Xapian::termcount(-1);
1177     mutable Xapian::termcount uniqueterms_upper = 0;
1178     mutable Xapian::termcount uniqueterms_lower = Xapian::termcount(-1);
1179     mutable Xapian::termcount wdf_upper = 0;
1180
1181     CheckStatsWeight(const Xapian::Database & db_,
1182                      const string & term1_,
1183                      const string & term2_,
1184                      Xapian::termcount & sum_,
1185                      Xapian::termcount & sum_squares_)
1186         : db(db_), term1(term1_), term2(term2_),
1187           sum(sum_), sum_squares(sum_squares_)
1188     {
1189         need_stat(COLLECTION_SIZE);
1190         need_stat(RSET_SIZE);
1191         need_stat(AVERAGE_LENGTH);
1192         need_stat(TERMFREQ);
1193         need_stat(RELTERMFREQ);
1194         need_stat(QUERY_LENGTH);
1195         need_stat(WQF);
1196         need_stat(WDF);
1197         need_stat(DOC_LENGTH);
1198         need_stat(DOC_LENGTH_MIN);
1199         need_stat(DOC_LENGTH_MAX);
1200         need_stat(DB_DOC_LENGTH_MIN);
1201         need_stat(DB_DOC_LENGTH_MAX);
1202         need_stat(WDF_MAX);
1203         need_stat(COLLECTION_FREQ);
1204         need_stat(UNIQUE_TERMS);
1205         need_stat(UNIQUE_TERMS_MIN);
1206         need_stat(UNIQUE_TERMS_MAX);
1207         need_stat(DB_UNIQUE_TERMS_MIN);
1208         need_stat(DB_UNIQUE_TERMS_MAX);
1209         need_stat(TOTAL_LENGTH);
1210         need_stat(WDF_DOC_MAX);
1211     }
1212
1213     CheckStatsWeight(const Xapian::Database & db_,
1214                      const string & term_,
1215                      Xapian::termcount & sum_,
1216                      Xapian::termcount & sum_squares_)
1217         : CheckStatsWeight(db_, term_, string(), sum_, sum_squares_) { }
1218
1219     void init(double factor_) override {
1220         factor = factor_;
1221     }
1222
1223     Weight* clone() const override {
1224         auto res = new CheckStatsWeight(db, term1, term2, sum, sum_squares);
1225         if (term2 == "=") {
1226             // The object passed to Enquire::set_weighting_scheme() is cloned
1227             // right away, and then cloned again for each term, and then
1228             // potentially once more for the term-independent weight
1229             // contribution.  In the repeated case, we want to handle the first
1230             // actual term specially, so we arrange for that to have "=" for
1231             // term2, and subsequent clones to have "_", so that we accumulate
1232             // sum and sum_squares on the first occurrence only.
1233             term2 = "_";
1234         }
1235         return res;
1236     }
1237
1238     double get_sumpart(Xapian::termcount wdf,
1239                        Xapian::termcount doclen,
1240                        Xapian::termcount uniqueterms,
1241                        Xapian::termcount wdfdocmax) const override {
1242         Xapian::doccount num_docs = db.get_doccount();
1243         TEST_EQUAL(get_collection_size(), num_docs);
1244         TEST_EQUAL(get_rset_size(), 0);
1245         TEST_EQUAL(get_average_length(), db.get_avlength());
1246         Xapian::totallength totlen = get_total_length();
1247         TEST_EQUAL(totlen, db.get_total_length());
1248         double total_term_occurences = get_average_length() * num_docs;
1249         TEST_EQUAL(Xapian::totallength(total_term_occurences + 0.5), totlen);
1250         if (term2.empty() || term2 == "=" || term2 == "_") {
1251             TEST_EQUAL(get_termfreq(), db.get_termfreq(term1));
1252             TEST_EQUAL(get_collection_freq(), db.get_collection_freq(term1));
1253             if (term2.empty()) {
1254                 TEST_EQUAL(get_query_length(), 1);
1255             } else {
1256                 TEST_EQUAL(get_query_length(), 2);
1257             }
1258         } else {
1259             Xapian::doccount tfmax = 0, tfsum = 0;
1260             Xapian::termcount cfmax = 0, cfsum = 0;
1261             if (term2 == "*") {
1262                 // OP_WILDCARD case.
1263                 for (auto&& t = db.allterms_begin(term1);
1264                      t != db.allterms_end(term1); ++t) {
1265                     Xapian::doccount tf = t.get_termfreq();
1266                     tout << "->" << *t << " " << tf << '\n';
1267                     tfsum += tf;
1268                     tfmax = max(tfmax, tf);
1269                     Xapian::termcount cf = db.get_collection_freq(*t);
1270                     cfsum += cf;
1271                     cfmax = max(cfmax, cf);
1272                 }
1273                 TEST_EQUAL(get_query_length(), 1);
1274             } else {
1275                 // OP_SYNONYM case.
1276                 Xapian::doccount tf1 = db.get_termfreq(term1);
1277                 Xapian::doccount tf2 = db.get_termfreq(term2);
1278                 tfsum = tf1 + tf2;
1279                 tfmax = max(tf1, tf2);
1280                 Xapian::termcount cf1 = db.get_collection_freq(term1);
1281                 Xapian::termcount cf2 = db.get_collection_freq(term2);
1282                 cfsum = cf1 + cf2;
1283                 cfmax = max(cf1, cf2);
1284                 TEST_EQUAL(get_query_length(), 2);
1285             }
1286             // Synonym occurs at least as many times as any term.
1287             TEST_REL(get_termfreq(), >=, tfmax);
1288             TEST_REL(get_collection_freq(), >=, cfmax);
1289             // Synonym can't occur more times than the terms do.
1290             TEST_REL(get_termfreq(), <=, tfsum);
1291             TEST_REL(get_collection_freq(), <=, cfsum);
1292             // Synonym can't occur more times than there are documents/terms.
1293             TEST_REL(get_termfreq(), <=, num_docs);
1294             TEST_REL(get_collection_freq(), <=, totlen);
1295         }
1296         TEST_EQUAL(get_reltermfreq(), 0);
1297         TEST_EQUAL(get_wqf(), 1);
1298         TEST_REL(doclen,>=,len_lower);
1299         TEST_REL(doclen,<=,len_upper);
1300         if (doclen > 0) {
1301             TEST_REL(uniqueterms,>=,1);
1302             TEST_REL(uniqueterms_lower,>=,1);
1303             TEST_REL(wdfdocmax,>=,1);
1304         }
1305         TEST_REL(uniqueterms,>=,uniqueterms_lower);
1306         TEST_REL(uniqueterms,<=,uniqueterms_upper);
1307         TEST_REL(uniqueterms,<=,doclen);
1308         TEST_REL(uniqueterms_upper,<=,len_upper);
1309         TEST_REL(wdf,<=,wdf_upper);
1310         TEST_REL(wdfdocmax,<=,doclen);
1311         TEST_REL(wdfdocmax,>=,wdf);
1312
1313         auto db_len_lower = db.get_doclength_lower_bound();
1314         auto db_len_upper = db.get_doclength_upper_bound();
1315         auto db_uniqueterms_lower = db.get_unique_terms_lower_bound();
1316         auto db_uniqueterms_upper = db.get_unique_terms_upper_bound();
1317         TEST_EQUAL(get_db_doclength_lower_bound(), db_len_lower);
1318         TEST_EQUAL(get_db_doclength_upper_bound(), db_len_upper);
1319         TEST_EQUAL(get_db_unique_terms_lower_bound(), db_uniqueterms_lower);
1320         TEST_EQUAL(get_db_unique_terms_upper_bound(), db_uniqueterms_upper);
1321         if (db.size() == 1) {
1322             TEST_EQUAL(len_lower, db_len_lower);
1323             TEST_EQUAL(len_upper, db_len_upper);
1324             TEST_EQUAL(uniqueterms_lower, db_uniqueterms_lower);
1325             TEST_EQUAL(uniqueterms_upper, db_uniqueterms_upper);
1326         } else {
1327             TEST_REL(len_lower,>=,db_len_lower);
1328             TEST_REL(len_upper,<=,db_len_upper);
1329             TEST_REL(uniqueterms_lower,>=,db_uniqueterms_lower);
1330             TEST_REL(uniqueterms_upper,<=,db_uniqueterms_upper);
1331         }
1332         if (term2 != "_") {
1333             sum += wdf;
1334             sum_squares += wdf * wdf;
1335         }
1336         return 1.0;
1337     }
1338
1339     double get_maxpart() const override {
1340         if (len_upper == 0) {
1341             len_lower = get_doclength_lower_bound();
1342             len_upper = get_doclength_upper_bound();
1343             uniqueterms_lower = get_unique_terms_lower_bound();
1344             uniqueterms_upper = get_unique_terms_upper_bound();
1345             wdf_upper = get_wdf_upper_bound();
1346         }
1347         return 1.0;
1348     }
1349
1350     double get_sumextra(Xapian::termcount doclen,
1351                         Xapian::termcount,
1352                         Xapian::termcount) const override {
1353         return 1.0 / doclen;
1354     }
1355
1356     double get_maxextra() const override { return 1.0; }
1357 };
1358
1359 /// Check the weight subclass gets the correct stats.
1360 DEFINE_TESTCASE(checkstatsweight1, backend && !remote) {
1361     Xapian::Database db = get_database("apitest_simpledata");
1362     Xapian::Enquire enquire(db);
1363     Xapian::TermIterator a;
1364     for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1365         const string & term = *a;
1366         enquire.set_query(Xapian::Query(term));
1367         Xapian::termcount sum = 0;
1368         Xapian::termcount sum_squares = 0;
1369         CheckStatsWeight wt(db, term, sum, sum_squares);
1370         enquire.set_weighting_scheme(wt);
1371         Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1372
1373         // The document order in the multi-db case isn't the same as the
1374         // postlist order on the combined DB, so it's hard to compare the
1375         // wdf for each document in the Weight objects, but we can sum
1376         // the wdfs and the squares of the wdfs which provides a decent
1377         // check that we're not getting the wrong wdf values (it ensures
1378         // they have the right mean and standard deviation).
1379         Xapian::termcount expected_sum = 0;
1380         Xapian::termcount expected_sum_squares = 0;
1381         Xapian::PostingIterator i;
1382         for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1383             Xapian::termcount wdf = i.get_wdf();
1384             expected_sum += wdf;
1385             expected_sum_squares += wdf * wdf;
1386         }
1387         TEST_EQUAL(sum, expected_sum);
1388         TEST_EQUAL(sum_squares, expected_sum_squares);
1389     }
1390 }
1391
1392 /// Check the weight subclass gets the correct stats with OP_SYNONYM.
1393 // Regression test for bugs fixed in 1.4.1.
1394 DEFINE_TESTCASE(checkstatsweight2, backend && !remote) {
1395     Xapian::Database db = get_database("apitest_simpledata");
1396     Xapian::Enquire enquire(db);
1397     Xapian::TermIterator a;
1398     for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1399         const string & term1 = *a;
1400         if (++a == db.allterms_end()) break;
1401         const string & term2 = *a;
1402         Xapian::Query q(Xapian::Query::OP_SYNONYM,
1403                         Xapian::Query(term1), Xapian::Query(term2));
1404         tout << q.get_description() << '\n';
1405         enquire.set_query(q);
1406         Xapian::termcount sum = 0;
1407         Xapian::termcount sum_squares = 0;
1408         CheckStatsWeight wt(db, term1, term2, sum, sum_squares);
1409         enquire.set_weighting_scheme(wt);
1410         Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1411
1412         // The document order in the multi-db case isn't the same as the
1413         // postlist order on the combined DB, so it's hard to compare the
1414         // wdf for each document in the Weight objects, but we can sum
1415         // the wdfs and the squares of the wdfs which provides a decent
1416         // check that we're not getting the wrong wdf values (it ensures
1417         // they have the right mean and standard deviation).
1418         Xapian::termcount expected_sum = 0;
1419         Xapian::termcount expected_sum_squares = 0;
1420         Xapian::PostingIterator i = db.postlist_begin(term1);
1421         Xapian::PostingIterator j = db.postlist_begin(term2);
1422         Xapian::docid did1 = *i, did2 = *j;
1423         while (true) {
1424             // To calculate expected_sum_squares correctly we need to square
1425             // the sum per document.
1426             Xapian::termcount wdf;
1427             if (did1 == did2) {
1428                 wdf = i.get_wdf() + j.get_wdf();
1429                 did1 = did2 = 0;
1430             } else if (did1 < did2) {
1431                 wdf = i.get_wdf();
1432                 did1 = 0;
1433             } else {
1434                 wdf = j.get_wdf();
1435                 did2 = 0;
1436             }
1437             expected_sum += wdf;
1438             expected_sum_squares += wdf * wdf;
1439
1440             if (did1 == 0) {
1441                 if (++i != db.postlist_end(term1)) {
1442                     did1 = *i;
1443                 } else {
1444                     if (did2 == Xapian::docid(-1)) break;
1445                     did1 = Xapian::docid(-1);
1446                 }
1447             }
1448             if (did2 == 0) {
1449                 if (++j != db.postlist_end(term2)) {
1450                     did2 = *j;
1451                 } else {
1452                     if (did1 == Xapian::docid(-1)) break;
1453                     did2 = Xapian::docid(-1);
1454                 }
1455             }
1456         }
1457         // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1458         // the individual terms.
1459         TEST_EQUAL(sum, expected_sum);
1460         TEST_REL(sum_squares, >=, expected_sum_squares);
1461     }
1462 }
1463
1464 /// Check the weight subclass gets the correct stats with OP_WILDCARD.
1465 // Regression test for bug fixed in 1.4.1.
1466 DEFINE_TESTCASE(checkstatsweight3, backend && !remote) {
1467     // The most correct thing to do would be to collate termfreqs across shards
1468     // for this, but if that's too hard to do efficiently we could at least
1469     // scale up the termfreqs proportional to the size of the shard.
1470     XFAIL_FOR_BACKEND("multi", "OP_WILDCARD+OP_SYNONYM use shard termfreqs");
1471
1472     struct PlCmp {
1473         bool operator()(const Xapian::PostingIterator& a,
1474                         const Xapian::PostingIterator& b) {
1475             return *a < *b;
1476         }
1477     };
1478
1479     Xapian::Database db = get_database("apitest_simpledata");
1480     Xapian::Enquire enquire(db);
1481     Xapian::TermIterator a;
1482     static const char * const testcases[] = {
1483         "a", // a* matches all documents, but no term matches all.
1484         "pa", // Expands to only "paragraph", matching 5.
1485         "zulu", // No matches.
1486         "th", // Term "this" matches all documents.
1487     };
1488     for (auto pattern : testcases) {
1489         Xapian::Query q(Xapian::Query::OP_WILDCARD, pattern);
1490         tout << q.get_description() << '\n';
1491         enquire.set_query(q);
1492         Xapian::termcount sum = 0;
1493         Xapian::termcount sum_squares = 0;
1494         CheckStatsWeight wt(db, pattern, "*", sum, sum_squares);
1495         enquire.set_weighting_scheme(wt);
1496         Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1497
1498         // The document order in the multi-db case isn't the same as the
1499         // postlist order on the combined DB, so it's hard to compare the
1500         // wdf for each document in the Weight objects, but we can sum
1501         // the wdfs and the squares of the wdfs which provides a decent
1502         // check that we're not getting the wrong wdf values (it ensures
1503         // they have the right mean and standard deviation).
1504         Xapian::termcount expected_sum = 0;
1505         Xapian::termcount expected_sum_squares = 0;
1506         vector<Xapian::PostingIterator> postlists;
1507         for (auto&& t = db.allterms_begin(pattern);
1508              t != db.allterms_end(pattern); ++t) {
1509             postlists.emplace_back(db.postlist_begin(*t));
1510         }
1511         Heap::make(postlists.begin(), postlists.end(), PlCmp());
1512         Xapian::docid did = 0;
1513         Xapian::termcount wdf = 0;
1514         while (!postlists.empty()) {
1515             Xapian::docid did_new = *postlists.front();
1516             Xapian::termcount wdf_new = postlists.front().get_wdf();
1517             if (++(postlists.front()) == Xapian::PostingIterator()) {
1518                 Heap::pop(postlists.begin(), postlists.end(), PlCmp());
1519                 postlists.pop_back();
1520             } else {
1521                 Heap::replace(postlists.begin(), postlists.end(), PlCmp());
1522             }
1523             if (did_new != did) {
1524                 expected_sum += wdf;
1525                 expected_sum_squares += wdf * wdf;
1526                 wdf = 0;
1527                 did = did_new;
1528             }
1529             wdf += wdf_new;
1530         }
1531         expected_sum += wdf;
1532         expected_sum_squares += wdf * wdf;
1533         // The OP_SYNONYM's wdf should be equal to the sum of the wdfs of
1534         // the individual terms.
1535         TEST_EQUAL(sum, expected_sum);
1536         TEST_REL(sum_squares, >=, expected_sum_squares);
1537     }
1538 }
1539
1540 /// Check the stats for a repeated term are correct.
1541 // Regression test for bug fixed in 1.4.6.  Doesn't work with
1542 // multi as the weight object is cloned more times.
1543 DEFINE_TESTCASE(checkstatsweight4, backend && !remote && !multi) {
1544     Xapian::Database db = get_database("apitest_simpledata");
1545     Xapian::Enquire enquire(db);
1546     Xapian::TermIterator a;
1547     for (a = db.allterms_begin(); a != db.allterms_end(); ++a) {
1548         const string & term = *a;
1549         enquire.set_query(Xapian::Query(term, 1, 1) |
1550                           Xapian::Query(term, 1, 2));
1551         Xapian::termcount sum = 0;
1552         Xapian::termcount sum_squares = 0;
1553         CheckStatsWeight wt(db, term, "=", sum, sum_squares);
1554         enquire.set_weighting_scheme(wt);
1555         Xapian::MSet mset = enquire.get_mset(0, db.get_doccount());
1556
1557         // The document order in the multi-db case isn't the same as the
1558         // postlist order on the combined DB, so it's hard to compare the
1559         // wdf for each document in the Weight objects, but we can sum
1560         // the wdfs and the squares of the wdfs which provides a decent
1561         // check that we're not getting the wrong wdf values (it ensures
1562         // they have the right mean and standard deviation).
1563         Xapian::termcount expected_sum = 0;
1564         Xapian::termcount expected_sum_squares = 0;
1565         Xapian::PostingIterator i;
1566         for (i = db.postlist_begin(term); i != db.postlist_end(term); ++i) {
1567             Xapian::termcount wdf = i.get_wdf();
1568             expected_sum += wdf;
1569             expected_sum_squares += wdf * wdf;
1570         }
1571         TEST_EQUAL(sum, expected_sum);
1572         TEST_EQUAL(sum_squares, expected_sum_squares);
1573     }
1574 }
1575
1576 class CheckStatsWeight5 : public Xapian::Weight {
1577   public:
1578     mutable Xapian::docid did = 0;
1579
1580     double factor;
1581
1582     Xapian::Database db;
1583
1584     char stat_code;
1585
1586     explicit
1587     CheckStatsWeight5(const Xapian::Database& db_, char stat_code_ = '\0')
1588         : factor(-1.0), db(db_), stat_code(stat_code_)
1589     {
1590         switch (stat_code) {
1591             case 'w':
1592                 need_stat(WDF);
1593                 break;
1594             case 'd':
1595                 need_stat(DOC_LENGTH);
1596                 break;
1597         }
1598         need_stat(WDF_DOC_MAX);
1599     }
1600
1601     void init(double factor_) override {
1602         factor = factor_;
1603     }
1604
1605     Weight* clone() const override {
1606         return new CheckStatsWeight5(db, stat_code);
1607     }
1608
1609     double get_sumpart(Xapian::termcount,
1610                        Xapian::termcount,
1611                        Xapian::termcount,
1612                        Xapian::termcount wdfdocmax) const override {
1613         // The query is a synonym of all terms, so should match all documents.
1614         ++did;
1615         TEST_REL(wdfdocmax,==,db.get_doclength(did));
1616         return 1.0 / wdfdocmax;
1617     }
1618
1619     double get_maxpart() const override {
1620         return 1.0;
1621     }
1622 };
1623
1624 /// Check wdfdocmax is clamped to doclen even if wdf and doclen aren't wanted.
1625 DEFINE_TESTCASE(checkstatsweight5, backend && !multi && !remote) {
1626     Xapian::Database db = get_database("apitest_simpledata");
1627     Xapian::Enquire enquire(db);
1628     Xapian::Query q{Xapian::Query::OP_SYNONYM,
1629                     db.allterms_begin(),
1630                     db.allterms_end()};
1631     enquire.set_query(q);
1632     enquire.set_weighting_scheme(CheckStatsWeight5(db));
1633     Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
1634     enquire.set_weighting_scheme(CheckStatsWeight5(db, 'w'));
1635     Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
1636     enquire.set_weighting_scheme(CheckStatsWeight5(db, 'd'));
1637     Xapian::MSet mset3 = enquire.get_mset(0, db.get_doccount());
1638 }
1639
1640 // Two stage should perform same as Jelinek mercer if smoothing parameter for mercer is kept 1 in both.
1641 DEFINE_TESTCASE(unigramlmweight4, backend) {
1642     Xapian::Database db = get_database("apitest_simpledata");
1643     Xapian::Enquire enquire1(db);
1644     Xapian::Enquire enquire2(db);
1645     enquire1.set_query(Xapian::Query("paragraph"));
1646     Xapian::MSet mset1;
1647     enquire2.set_query(Xapian::Query("paragraph"));
1648     Xapian::MSet mset2;
1649     // 5 documents available with term paragraph so mset size should be 5
1650     enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::TWO_STAGE_SMOOTHING, 1, 0));
1651     enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 1, 0));
1652     mset1 = enquire1.get_mset(0, 10);
1653     mset2 = enquire2.get_mset(0, 10);
1654
1655     TEST_EQUAL(mset1.size(), 5);
1656     TEST_EQUAL_DOUBLE(mset1[1].get_weight(), mset2[1].get_weight());
1657 }
1658
1659 /* Test for checking if we don't use smoothing all
1660  * of them should give same result i.e wdf_double/len_double */
1661 DEFINE_TESTCASE(unigramlmweight5, backend) {
1662     Xapian::Database db = get_database("apitest_simpledata");
1663     Xapian::Enquire enquire1(db);
1664     Xapian::Enquire enquire2(db);
1665     Xapian::Enquire enquire3(db);
1666     Xapian::Enquire enquire4(db);
1667     enquire1.set_query(Xapian::Query("paragraph"));
1668     Xapian::MSet mset1;
1669     enquire2.set_query(Xapian::Query("paragraph"));
1670     Xapian::MSet mset2;
1671     enquire3.set_query(Xapian::Query("paragraph"));
1672     Xapian::MSet mset3;
1673     enquire4.set_query(Xapian::Query("paragraph"));
1674     Xapian::MSet mset4;
1675     // 5 documents available with term paragraph so mset size should be 5
1676     enquire1.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0, 0));
1677     enquire2.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0, 0));
1678     enquire3.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING, 0, 0));
1679     enquire4.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::DIRICHLET_SMOOTHING, 0, 0));
1680
1681     mset1 = enquire1.get_mset(0, 10);
1682     mset2 = enquire2.get_mset(0, 10);
1683     mset3 = enquire3.get_mset(0, 10);
1684     mset4 = enquire4.get_mset(0, 10);
1685
1686     TEST_EQUAL(mset1.size(), 5);
1687     TEST_EQUAL(mset2.size(), 5);
1688     TEST_EQUAL(mset3.size(), 5);
1689     TEST_EQUAL(mset4.size(), 5);
1690     for (Xapian::doccount i = 0; i < 5; ++i) {
1691         TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset4[i].get_weight());
1692         TEST_EQUAL_DOUBLE(mset2[i].get_weight(), mset4[i].get_weight());
1693         TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset2[i].get_weight());
1694         TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset2[i].get_weight());
1695         TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset4[i].get_weight());
1696         TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight());
1697     }
1698 }
1699
1700 // Feature test for Dir+ function.
1701 DEFINE_TESTCASE(unigramlmweight7, backend) {
1702     Xapian::Database db = get_database("apitest_simpledata");
1703     Xapian::Enquire enquire1(db);
1704     Xapian::Enquire enquire2(db);
1705     enquire1.set_query(Xapian::Query("paragraph"));
1706     enquire2.set_query(Xapian::Query("paragraph"));
1707     Xapian::MSet mset1;
1708     Xapian::MSet mset2;
1709
1710     enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1711     enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING, 2000, 0.05));
1712
1713     mset1 = enquire1.get_mset(0, 10);
1714     mset2 = enquire2.get_mset(0, 10);
1715
1716     // mset size should be 5
1717     TEST_EQUAL(mset1.size(), 5);
1718     TEST_EQUAL(mset2.size(), 5);
1719
1720     // Expect mset weights associated with Dir+ more than mset weights by Dir
1721     // because of the presence of extra weight component in Dir+ function.
1722     TEST_REL(mset2[0].get_weight(),>,mset1[0].get_weight());
1723     TEST_REL(mset2[1].get_weight(),>,mset1[1].get_weight());
1724     TEST_REL(mset2[2].get_weight(),>,mset1[2].get_weight());
1725     TEST_REL(mset2[3].get_weight(),>,mset1[3].get_weight());
1726     TEST_REL(mset2[4].get_weight(),>,mset1[4].get_weight());
1727 }
1728
1729 // Regression test that OP_SCALE_WEIGHT works with LMWeight (fixed in 1.4.1).
1730 DEFINE_TESTCASE(unigramlmweight8, backend) {
1731     Xapian::Database db = get_database("apitest_simpledata");
1732     Xapian::Enquire enquire(db);
1733     Xapian::Query query("paragraph");
1734
1735     enquire.set_query(query);
1736     enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1737
1738     Xapian::MSet mset1;
1739     mset1 = enquire.get_mset(0, 10);
1740     TEST_EQUAL(mset1.size(), 5);
1741
1742     enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0));
1743     enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0));
1744
1745     Xapian::MSet mset2;
1746     mset2 = enquire.get_mset(0, 10);
1747     TEST_EQUAL(mset2.size(), mset1.size());
1748     TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0);
1749     for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
1750         TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight());
1751     }
1752 }
1753
1754 // Feature test for CoordWeight.
1755 DEFINE_TESTCASE(coordweight1, backend) {
1756     Xapian::Enquire enquire(get_database("apitest_simpledata"));
1757     enquire.set_weighting_scheme(Xapian::CoordWeight());
1758     static const char * const terms[] = {
1759         "this", "line", "paragraph", "rubbish"
1760     };
1761     Xapian::Query query(Xapian::Query::OP_OR, terms, std::end(terms));
1762     enquire.set_query(query);
1763     Xapian::MSet mymset1 = enquire.get_mset(0, 100);
1764     // CoordWeight scores 1 for each matching term, so the weight should equal
1765     // the number of matching terms.
1766     for (Xapian::MSetIterator i = mymset1.begin(); i != mymset1.end(); ++i) {
1767         Xapian::termcount matching_terms = 0;
1768         Xapian::TermIterator t = enquire.get_matching_terms_begin(i);
1769         while (t != enquire.get_matching_terms_end(i)) {
1770             ++matching_terms;
1771             ++t;
1772         }
1773         TEST_EQUAL(i.get_weight(), matching_terms);
1774     }
1775 }
1776
1777 // Feature test.
1778 DEFINE_TESTCASE(dicecoeffweight2, backend) {
1779     Xapian::Database db = get_database("apitest_simpledata3");
1780     Xapian::Enquire enquire(db);
1781     static const char * const terms[] = {
1782         "one", "three"
1783     };
1784     Xapian::Query query(Xapian::Query::OP_OR, terms, std::end(terms));
1785     enquire.set_query(query);
1786     enquire.set_weighting_scheme(Xapian::DiceCoeffWeight());
1787
1788     Xapian::MSet mset1;
1789     mset1 = enquire.get_mset(0, 10);
1790     TEST_EQUAL(mset1.size(), 4);
1791
1792     /* The weight value has been manually calculated by using the statistics
1793      * of the test database. */
1794     TEST_EQUAL_DOUBLE(mset1[0].get_weight(), 0.571428571428571);
1795     TEST_EQUAL_DOUBLE(mset1[1].get_weight(), 0.5);
1796     TEST_EQUAL_DOUBLE(mset1[2].get_weight(), 0.2);
1797     TEST_EQUAL_DOUBLE(mset1[3].get_weight(), 0.181818181818182);
1798 }
1799
1800 // Test handling of a term with zero wdf.
1801 DEFINE_TESTCASE(dicecoeffweight3, backend) {
1802     Xapian::Database db = get_database("dicecoeffweight3",
1803                                        [](Xapian::WritableDatabase& wdb,
1804                                           const string&) {
1805                                            Xapian::Document doc;
1806                                            doc.add_term("radio", 2);
1807                                            doc.add_term("seahorse");
1808                                            doc.add_term("zebra");
1809                                            doc.add_boolean_term("false");
1810                                            doc.add_boolean_term("true");
1811                                            wdb.add_document(doc);
1812                                        });
1813     Xapian::Enquire enquire(db);
1814     enquire.set_weighting_scheme(Xapian::DiceCoeffWeight());
1815
1816     // OP_SYNONYM gives wdf zero is need_stat(WDF) isn't specified (and
1817     // it isn't by DiceCoeffWeight).
1818     Xapian::Query q(Xapian::Query::OP_SYNONYM,
1819                     Xapian::Query("false"), Xapian::Query("true"));
1820     enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT,
1821                                     q, 6.0), 2);
1822     Xapian::MSet mset = enquire.get_mset(0, 10);
1823     TEST_EQUAL(mset.size(), 1);
1824
1825     // factor * 2.0 * wqf / (query_length + unique_term_count)
1826     // = 6.0 * 2.0 * 1 / (2 + 4) = 2.0
1827     TEST_EQUAL_DOUBLE(mset[0].get_weight(), 2.0);
1828 }