From b97c122cf6f9d06fc8ace542735b0d9e9f18e9bc Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 20 Jun 2024 16:49:45 +1200 Subject: [PATCH] Reimplement Language Modelling weights LMWeight was using incorrect formulae for all the smoothing schemes. The replacement implementation separates each smoothing into a separate class (except that Dirichlet and Dir+ are combined into one class), since that seems to give a cleaner public API, and also simplifies the implementation. It also means we can now clearly say "don't use LMWeight" without risk of confusion. Fixes #779, reported by Sourav Saha. Closes https://github.com/xapian/xapian/pull/230 --- xapian-bindings/csharp/Makefile.am | 5 +- xapian-bindings/java/Makefile.am | 5 +- xapian-bindings/python3/pythontest.py | 5 +- xapian-core/api/registry.cc | 11 +- xapian-core/examples/quest.cc | 51 +- xapian-core/include/xapian/weight.h | 326 +++++++++--- xapian-core/tests/api_nodb.cc | 2 +- xapian-core/tests/api_weight.cc | 144 ++---- xapian-core/weight/lmweight.cc | 899 ++++++++++++++++++++++------------ xapian-core/weight/weightinternal.h | 1 + 10 files changed, 927 insertions(+), 522 deletions(-) rewrite xapian-core/weight/lmweight.cc (86%) diff --git a/xapian-bindings/csharp/Makefile.am b/xapian-bindings/csharp/Makefile.am index 5d4f07232..64a55450b 100644 --- a/xapian-bindings/csharp/Makefile.am +++ b/xapian-bindings/csharp/Makefile.am @@ -35,7 +35,10 @@ XAPIAN_SWIG_CS_SRCS=\ generated-csharp/IneB2Weight.cs \ generated-csharp/InL2Weight.cs \ generated-csharp/KeyMaker.cs \ - generated-csharp/LMWeight.cs \ + generated-csharp/LM2StageWeight.cs \ + generated-csharp/LMAbsDiscountWeight.cs \ + generated-csharp/LMDirichletWeight.cs \ + generated-csharp/LMJMWeight.cs \ generated-csharp/LatLongCoord.cs \ generated-csharp/LatLongCoords.cs \ generated-csharp/LatLongCoordsIterator.cs \ diff --git a/xapian-bindings/java/Makefile.am b/xapian-bindings/java/Makefile.am index a633d92e8..6534e26ca 100644 --- a/xapian-bindings/java/Makefile.am +++ b/xapian-bindings/java/Makefile.am @@ -59,7 +59,10 @@ XAPIAN_SWIG_JAVA_SRCS=\ org/xapian/IneB2Weight.java\ org/xapian/InL2Weight.java\ org/xapian/KeyMaker.java\ - org/xapian/LMWeight.java \ + org/xapian/LM2StageWeight.java\ + org/xapian/LMAbsDiscountWeight.java\ + org/xapian/LMDirichletWeight.java\ + org/xapian/LMJMWeight.java\ org/xapian/LatLongCoord.java\ org/xapian/LatLongCoords.java\ org/xapian/LatLongCoordsIterator.java\ diff --git a/xapian-bindings/python3/pythontest.py b/xapian-bindings/python3/pythontest.py index 3109096f2..d1524531e 100644 --- a/xapian-bindings/python3/pythontest.py +++ b/xapian-bindings/python3/pythontest.py @@ -1697,7 +1697,10 @@ def test_repr(): expect(repr(xapian.PL2Weight()) is None, False) expect(repr(xapian.PL2PlusWeight()) is None, False) expect(repr(xapian.DPHWeight()) is None, False) - expect(repr(xapian.LMWeight()) is None, False) + expect(repr(xapian.LM2StageWeight()) is None, False) + expect(repr(xapian.LMAbsDiscountWeight()) is None, False) + expect(repr(xapian.LMDirichletWeight()) is None, False) + expect(repr(xapian.LMJMWeight()) is None, False) expect(repr(xapian.CoordWeight()) is None, False) expect(repr(xapian.Compactor()) is None, False) expect(repr(xapian.ValuePostingSource(1)) is None, False) diff --git a/xapian-core/api/registry.cc b/xapian-core/api/registry.cc index b1801c510..0c20fa0b6 100644 --- a/xapian-core/api/registry.cc +++ b/xapian-core/api/registry.cc @@ -268,7 +268,16 @@ Registry::Internal::add_defaults() weighting_scheme = new Xapian::DPHWeight; 
wtschemes[weighting_scheme->name()] = weighting_scheme; wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; - weighting_scheme = new Xapian::LMWeight; + weighting_scheme = new Xapian::LMJMWeight; + wtschemes[weighting_scheme->name()] = weighting_scheme; + wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; + weighting_scheme = new Xapian::LMDirichletWeight; + wtschemes[weighting_scheme->name()] = weighting_scheme; + wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; + weighting_scheme = new Xapian::LMAbsDiscountWeight; + wtschemes[weighting_scheme->name()] = weighting_scheme; + wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; + weighting_scheme = new Xapian::LM2StageWeight; wtschemes[weighting_scheme->name()] = weighting_scheme; wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; weighting_scheme = new Xapian::DiceCoeffWeight; diff --git a/xapian-core/examples/quest.cc b/xapian-core/examples/quest.cc index 9680799be..ddc0a006c 100644 --- a/xapian-core/examples/quest.cc +++ b/xapian-core/examples/quest.cc @@ -120,7 +120,10 @@ enum { WEIGHT_IFB2, WEIGHT_INEB2, WEIGHT_INL2, - WEIGHT_LM, + WEIGHT_LM2STAGE, + WEIGHT_LMABSDISCOUNT, + WEIGHT_LMDIRICHLET, + WEIGHT_LMJM, WEIGHT_PL2, WEIGHT_PL2PLUS, WEIGHT_TFIDF, @@ -128,21 +131,24 @@ enum { }; static const tab_entry wt_tab[] = { - { "bb2", WEIGHT_BB2 }, - { "bm25", WEIGHT_BM25 }, - { "bm25+", WEIGHT_BM25PLUS }, - { "bool", WEIGHT_BOOL }, - { "coord", WEIGHT_COORD }, - { "dlh", WEIGHT_DLH }, - { "dph", WEIGHT_DPH }, - { "ifb2", WEIGHT_IFB2 }, - { "ineb2", WEIGHT_INEB2 }, - { "inl2", WEIGHT_INL2 }, - { "lm", WEIGHT_LM }, - { "pl2", WEIGHT_PL2 }, - { "pl2+", WEIGHT_PL2PLUS }, - { "tfidf", WEIGHT_TFIDF }, - { "trad", WEIGHT_TRAD } + { "bb2", WEIGHT_BB2 }, + { "bm25", WEIGHT_BM25 }, + { "bm25+", WEIGHT_BM25PLUS }, + { "bool", WEIGHT_BOOL }, + { "coord", WEIGHT_COORD }, + { "dlh", WEIGHT_DLH }, + { "dph", WEIGHT_DPH }, + { "ifb2", WEIGHT_IFB2 }, + { "ineb2", WEIGHT_INEB2 }, + { "inl2", WEIGHT_INL2 }, + { "lm2stage", WEIGHT_LM2STAGE }, + { "lmabsdiscount", WEIGHT_LMABSDISCOUNT }, + { "lmdirichlet", WEIGHT_LMDIRICHLET }, + { "lmjm", WEIGHT_LMJM }, + { "pl2", WEIGHT_PL2 }, + { "pl2+", WEIGHT_PL2PLUS }, + { "tfidf", WEIGHT_TFIDF }, + { "trad", WEIGHT_TRAD } }; /** The number of spaces to indent by in print_table. @@ -398,8 +404,17 @@ try { case WEIGHT_INL2: enquire.set_weighting_scheme(Xapian::InL2Weight()); break; - case WEIGHT_LM: - enquire.set_weighting_scheme(Xapian::LMWeight()); + case WEIGHT_LM2STAGE: + enquire.set_weighting_scheme(Xapian::LM2StageWeight()); + break; + case WEIGHT_LMABSDISCOUNT: + enquire.set_weighting_scheme(Xapian::LMAbsDiscountWeight()); + break; + case WEIGHT_LMDIRICHLET: + enquire.set_weighting_scheme(Xapian::LMDirichletWeight()); + break; + case WEIGHT_LMJM: + enquire.set_weighting_scheme(Xapian::LMJMWeight()); break; case WEIGHT_PL2: enquire.set_weighting_scheme(Xapian::PL2Weight()); diff --git a/xapian-core/include/xapian/weight.h b/xapian-core/include/xapian/weight.h index 9877d67d2..f6055b732 100644 --- a/xapian-core/include/xapian/weight.h +++ b/xapian-core/include/xapian/weight.h @@ -250,18 +250,6 @@ class XAPIAN_VISIBILITY_DEFAULT Weight { /// Default constructor, needed by subclass constructors. Weight() : stats_needed() { } - /** Type of smoothing to use with the Language Model Weighting scheme. - * - * Default is TWO_STAGE_SMOOTHING. 
- */ - typedef enum { - TWO_STAGE_SMOOTHING = 1, - DIRICHLET_SMOOTHING = 2, - ABSOLUTE_DISCOUNT_SMOOTHING = 3, - JELINEK_MERCER_SMOOTHING = 4, - DIRICHLET_PLUS_SMOOTHING = 5 - } type_smoothing; - class Internal; /** Virtual destructor, because we have virtual methods. */ @@ -1827,112 +1815,288 @@ class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight { }; -/** Xapian::Weight subclass implementing the Language Model formula. +/** Language Model weighting with Jelinek-Mercer smoothing. * - * This class implements the "Language Model" Weighting scheme, as - * described by the early papers on LM by Bruce Croft. + * As described in: * - * LM works by comparing the query to a Language Model of the document. - * The language model itself is parameter-free, though LMWeight takes - * parameters which specify the smoothing used. + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. */ -class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight { +class XAPIAN_VISIBILITY_DEFAULT LMJMWeight : public Weight { /// The factor to multiply weights by. double factor; - /** The type of smoothing to use. */ - type_smoothing select_smoothing; - - // Parameters for handling negative value of log, and for smoothing. - double param_log, param_smoothing1, param_smoothing2; + /// Parameter controlling the smoothing. + double param_lambda; - // Collection weight. - double weight_collection; + /// Precalculated multiplier for use in weight calculations. + double multiplier; - LMWeight * clone() const; + LMJMWeight* clone() const; void init(double factor_); public: - /** Construct a LMWeight. - * - * @param param_log_ A non-negative parameter controlling how much - * to clamp negative values returned by the log. - * The log is calculated by multiplying the - * actual weight by param_log. If param_log is - * 0.0, then the document length upper bound will - * be used (default: document length upper bound) - * - * @param select_smoothing_ A parameter of type enum - * type_smoothing. This parameter - * controls which smoothing type to use. - * (default: TWO_STAGE_SMOOTHING) - * - * @param param_smoothing1_ A non-negative parameter for smoothing - * whose meaning depends on - * select_smoothing_. In - * JELINEK_MERCER_SMOOTHING, it plays the - * role of estimation and in - * DIRICHLET_SMOOTHING the role of query - * modelling. (default JELINEK_MERCER, - * ABSOLUTE, TWOSTAGE(0.7), - * DIRCHLET(2000)) - * - * @param param_smoothing2_ A non-negative parameter which is used - * with TWO_STAGE_SMOOTHING as parameter for Dirichlet's - * smoothing (default: 2000) and as parameter delta to - * control the scale of the tf lower bound in the - * DIRICHLET_PLUS_SMOOTHING (default 0.05). + /** Construct a LMJMWeight. * + * @param lambda A parameter strictly between 0 and 1 which linearly + * interpolates between the maximum likelihood model (the + * limit as λ→0) and the collection model (the limit as + * λ→1). + * + * Values of λ around 0.1 are apparently optimal for short + * queries and around 0.7 for long queries. If lambda is + * out of range (i.e. <= 0 or >= 1) then the λ value used + * is chosen dynamically based on the query length using + * the formula: + * + * (query_length - 1) / 10.0 + * + * The result is clamped to 0.1 for query_length <= 2, and + * to 0.7 for query_length >= 8. */ - // Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing. 
- explicit LMWeight(double param_log_ = 0.0, - type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING, - double param_smoothing1_ = -1.0, - double param_smoothing2_ = -1.0) - : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_), - param_smoothing2(param_smoothing2_) - { - if (param_smoothing1 < 0) param_smoothing1 = 0.7; - if (param_smoothing2 < 0) { - if (select_smoothing == TWO_STAGE_SMOOTHING) - param_smoothing2 = 2000.0; - else - param_smoothing2 = 0.05; - } + explicit LMJMWeight(double lambda = 0.0) : param_lambda(lambda) { + need_stat(QUERY_LENGTH); need_stat(DOC_LENGTH); - need_stat(RSET_SIZE); - need_stat(TERMFREQ); - need_stat(RELTERMFREQ); - need_stat(DOC_LENGTH_MAX); need_stat(WDF); need_stat(WDF_MAX); need_stat(COLLECTION_FREQ); need_stat(TOTAL_LENGTH); - if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) - need_stat(UNIQUE_TERMS); - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) - need_stat(DOC_LENGTH_MIN); + need_stat(DOC_LENGTH_MIN); } + double get_sumpart(Xapian::termcount wdf, + Xapian::termcount doclen, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + + double get_maxpart() const; + std::string name() const; std::string short_name() const; std::string serialise() const; - LMWeight * unserialise(const std::string & serialised) const; + LMJMWeight* unserialise(const std::string& serialised) const; + + LMJMWeight* create_from_parameters(const char* params) const; +}; + +/** Language Model weighting with Dirichlet or Dir+ smoothing. + * + * Dirichlet smoothing is as described in: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + * + * Dir+ is described in: + * + * Lv, Y., & Zhai, C. (2011). Lower-bounding term frequency normalization. + * International Conference on Information and Knowledge Management. + */ +class XAPIAN_VISIBILITY_DEFAULT LMDirichletWeight : public Weight { + /// The factor to multiply weights by. + double factor; + + /// Parameter controlling the smoothing. + double param_mu; + + /// A pseudo TF value to control the scale of the TF lower bound. + double param_delta; + + /// Precalculated multiplier for use in weight calculations. + double multiplier; + + /** Precalculated offset to add to every sumextra. + * + * This is needed because the formula can return a negative + * term-independent weight. + */ + double extra_offset; + + LMDirichletWeight* clone() const; + + void init(double factor_); + + public: + /** Construct a LMDirichletWeight. + * + * @param mu A parameter which is > 0. Default: 2000 + * @param delta A parameter which is >= 0, which is "a pseudo [wdf] + * value to control the scale of the [wdf] lower bound". + * If this parameter is > 0, then the smoothing is Dir+; + * if it's zero, it's Dirichlet. 
Default: 0.05 + */ + explicit LMDirichletWeight(double mu = 2000.0, double delta = 0.05) + : param_mu(mu), param_delta(delta) { + need_stat(QUERY_LENGTH); + need_stat(DOC_LENGTH); + need_stat(WDF); + need_stat(WDF_MAX); + need_stat(COLLECTION_FREQ); + need_stat(TOTAL_LENGTH); + need_stat(DOC_LENGTH_MIN); + need_stat(DOC_LENGTH_MAX); + } double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterm, Xapian::termcount wdfdocmax) const; + + double get_maxpart() const; + + double get_sumextra(Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const; + + double get_maxextra() const; + + std::string name() const; + std::string short_name() const; + + std::string serialise() const; + LMDirichletWeight* unserialise(const std::string& serialised) const; + + LMDirichletWeight* create_from_parameters(const char* params) const; +}; + +/** Language Model weighting with Absolute Discount smoothing. + * + * As described in: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + */ +class XAPIAN_VISIBILITY_DEFAULT LMAbsDiscountWeight : public Weight { + /// The factor to multiply weights by. + double factor; + + /// Parameter controlling the smoothing. + double param_delta; + + /// Precalculated multiplier for use in weight calculations. + double multiplier; + + /** Precalculated offset to add to every sumextra. + * + * This is needed because the formula can return a negative + * term-independent weight. + */ + double extra_offset; + + LMAbsDiscountWeight* clone() const; + + void init(double factor_); + + public: + /** Construct a LMAbsDiscountWeight. + * + * @param delta A parameter between 0 and 1. Default: 0.7 + */ + explicit LMAbsDiscountWeight(double delta = 0.7) : param_delta(delta) { + need_stat(QUERY_LENGTH); + need_stat(DOC_LENGTH); + need_stat(WDF); + need_stat(WDF_MAX); + need_stat(COLLECTION_FREQ); + need_stat(TOTAL_LENGTH); + need_stat(DOC_LENGTH_MIN); + need_stat(UNIQUE_TERMS); + need_stat(DOC_LENGTH_MAX); + } + + double get_sumpart(Xapian::termcount wdf, + Xapian::termcount, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + double get_maxpart() const; double get_sumextra(Xapian::termcount doclen, Xapian::termcount, Xapian::termcount) const; + double get_maxextra() const; - LMWeight * create_from_parameters(const char * params) const; + std::string name() const; + std::string short_name() const; + + std::string serialise() const; + LMAbsDiscountWeight* unserialise(const std::string& serialised) const; + + LMAbsDiscountWeight* create_from_parameters(const char* params) const; +}; + +/** Language Model weighting with Two Stage smoothing. + * + * As described in: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + */ +class XAPIAN_VISIBILITY_DEFAULT LM2StageWeight : public Weight { + /// The factor to multiply weights by. + double factor; + + /// Parameter controlling the smoothing. + double param_lambda; + + /// Parameter controlling the smoothing. + double param_mu; + + /// Precalculated multiplier for use in weight calculations. + double multiplier; + + /** Precalculated offset to add to every sumextra. + * + * This is needed because the formula can return a negative + * term-independent weight. 
+ */ + double extra_offset; + + LM2StageWeight* clone() const; + + void init(double factor_); + + public: + /** Construct a LM2StageWeight. + * + * @param lambda A parameter between 0 and 1 which linearly interpolates + * between the maximum likelihood model (at 0) and the + * collection model (at 1). Default: 0.7 + * @param mu A parameter which is greater than 0. Default: 2000 + */ + explicit LM2StageWeight(double lambda = 0.7, double mu = 2000.0) + : param_lambda(lambda), param_mu(mu) + { + need_stat(QUERY_LENGTH); + need_stat(DOC_LENGTH); + need_stat(WDF); + need_stat(WDF_MAX); + need_stat(COLLECTION_FREQ); + need_stat(TOTAL_LENGTH); + need_stat(DOC_LENGTH_MIN); + need_stat(DOC_LENGTH_MAX); + } + + double get_sumpart(Xapian::termcount wdf, + Xapian::termcount doclen, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + double get_maxpart() const; + + double get_sumextra(Xapian::termcount doclen, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + double get_maxextra() const; + + std::string name() const; + std::string short_name() const; + + std::string serialise() const; + LM2StageWeight* unserialise(const std::string& serialised) const; + + LM2StageWeight* create_from_parameters(const char* params) const; }; /** Xapian::Weight subclass implementing Coordinate Matching. diff --git a/xapian-core/tests/api_nodb.cc b/xapian-core/tests/api_nodb.cc index 0863d29d2..70609d6e2 100644 --- a/xapian-core/tests/api_nodb.cc +++ b/xapian-core/tests/api_nodb.cc @@ -3,7 +3,7 @@ */ /* Copyright 1999,2000,2001 BrightStation PLC * Copyright 2002 Ananova Ltd - * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2015,2016,2017,2019 Olly Betts + * Copyright 2002-2024 Olly Betts * Copyright 2006 Lemur Consulting Ltd * * This program is free software; you can redistribute it and/or diff --git a/xapian-core/tests/api_weight.cc b/xapian-core/tests/api_weight.cc index d59fe0451..1f772691d 100644 --- a/xapian-core/tests/api_weight.cc +++ b/xapian-core/tests/api_weight.cc @@ -135,9 +135,14 @@ DEFINE_TESTCASE(weightserialisation1, !backend) { TEST_WEIGHT_CLASS(Xapian::PL2PlusWeight, (1.0, 0.8), (2.0, 0.9)); - TEST_WEIGHT_CLASS(Xapian::LMWeight, - (0.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0.7, 2000.0), - (0.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0.7)); + TEST_WEIGHT_CLASS(Xapian::LM2StageWeight, + (0.7, 2000.0), + (0.5, 2000.0)); + TEST_WEIGHT_CLASS(Xapian::LMAbsDiscountWeight, (0.7), (0.75)); + TEST_WEIGHT_CLASS(Xapian::LMDirichletWeight, + (2000.0, 0.05), + (2034.0, 0.0)); + TEST_WEIGHT_CLASS(Xapian::LMJMWeight, (0.0), (0.5)); } /// Basic test of using weighting schemes. @@ -176,9 +181,26 @@ DEFINE_TESTCASE(weight1, backend) { } Xapian::MSet mset_scaled = enquire_scaled.get_mset(0, expected_matches); TEST_EQUAL(mset_scaled.size(), expected_matches); + auto lm = name.find("::LM"); + // All the LM* schemes have sumextra except LMJMWeight. + // + // BM25 and BM25+ have sumextra, but by default k2 is 0 which means + // sumextra is zero too. + bool has_sumextra = lm != string::npos && name[lm + 4] != 'J'; for (Xapian::doccount i = 0; i < expected_matches; ++i) { - TEST_EQUAL_DOUBLE(mset_scaled[i].get_weight(), - mset[i].get_weight() * 15.0); + double w = mset[i].get_weight(); + double ws = mset_scaled[i].get_weight(); + if (has_sumextra) { + // sumextra is not scaled, so we can't test for (near) + // equality, but we can test that the weight is affected by the + // scaling, and that it's between the unscaled weight and the + // fully scaled weight. 
+ TEST_NOT_EQUAL_DOUBLE(ws, w); + TEST_REL(ws, <=, w * 15.0); + TEST_REL(ws, >=, w); + } else { + TEST_EQUAL_DOUBLE(ws, w * 15.0); + } } }; @@ -201,7 +223,10 @@ DEFINE_TESTCASE(weight1, backend) { TEST_WEIGHTING_SCHEME(Xapian::BB2Weight); TEST_WEIGHTING_SCHEME(Xapian::PL2Weight); TEST_WEIGHTING_SCHEME(Xapian::PL2PlusWeight); - TEST_WEIGHTING_SCHEME(Xapian::LMWeight); + TEST_WEIGHTING_SCHEME(Xapian::LM2StageWeight); + TEST_WEIGHTING_SCHEME(Xapian::LMAbsDiscountWeight); + TEST_WEIGHTING_SCHEME(Xapian::LMDirichletWeight); + TEST_WEIGHTING_SCHEME(Xapian::LMJMWeight); // Regression test for bug fixed in 1.2.4. TEST_WEIGHTING_SCHEME(Xapian::BM25Weight, 0, 0, 0, 0, 1); /* As mentioned in the documentation, when parameter k is 0, wdf and @@ -1640,78 +1665,18 @@ DEFINE_TESTCASE(checkstatsweight5, backend && !multi && !remote) { Xapian::MSet mset3 = enquire.get_mset(0, db.get_doccount()); } -// Two stage should perform same as Jelinek mercer if smoothing parameter for mercer is kept 1 in both. -DEFINE_TESTCASE(unigramlmweight4, backend) { +// Feature test for Dir+ weighting. +DEFINE_TESTCASE(lmdirichletweight1, backend) { Xapian::Database db = get_database("apitest_simpledata"); Xapian::Enquire enquire1(db); Xapian::Enquire enquire2(db); enquire1.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset1; enquire2.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset2; - // 5 documents available with term paragraph so mset size should be 5 - enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::TWO_STAGE_SMOOTHING, 1, 0)); - enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 1, 0)); - mset1 = enquire1.get_mset(0, 10); - mset2 = enquire2.get_mset(0, 10); - - TEST_EQUAL(mset1.size(), 5); - TEST_EQUAL_DOUBLE(mset1[1].get_weight(), mset2[1].get_weight()); -} - -/* Test for checking if we don't use smoothing all - * of them should give same result i.e wdf_double/len_double */ -DEFINE_TESTCASE(unigramlmweight5, backend) { - Xapian::Database db = get_database("apitest_simpledata"); - Xapian::Enquire enquire1(db); - Xapian::Enquire enquire2(db); - Xapian::Enquire enquire3(db); - Xapian::Enquire enquire4(db); - enquire1.set_query(Xapian::Query("paragraph")); Xapian::MSet mset1; - enquire2.set_query(Xapian::Query("paragraph")); Xapian::MSet mset2; - enquire3.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset3; - enquire4.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset4; - // 5 documents available with term paragraph so mset size should be 5 - enquire1.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0, 0)); - enquire2.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0, 0)); - enquire3.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING, 0, 0)); - enquire4.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::DIRICHLET_SMOOTHING, 0, 0)); - - mset1 = enquire1.get_mset(0, 10); - mset2 = enquire2.get_mset(0, 10); - mset3 = enquire3.get_mset(0, 10); - mset4 = enquire4.get_mset(0, 10); - TEST_EQUAL(mset1.size(), 5); - TEST_EQUAL(mset2.size(), 5); - TEST_EQUAL(mset3.size(), 5); - TEST_EQUAL(mset4.size(), 5); - for (Xapian::doccount i = 0; i < 5; ++i) { - TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset4[i].get_weight()); - TEST_EQUAL_DOUBLE(mset2[i].get_weight(), mset4[i].get_weight()); - TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset2[i].get_weight()); - TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset2[i].get_weight()); - 
TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset4[i].get_weight()); - TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight()); - } -} - -// Feature test for Dir+ function. -DEFINE_TESTCASE(unigramlmweight7, backend) { - Xapian::Database db = get_database("apitest_simpledata"); - Xapian::Enquire enquire1(db); - Xapian::Enquire enquire2(db); - enquire1.set_query(Xapian::Query("paragraph")); - enquire2.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset1; - Xapian::MSet mset2; - - enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0)); - enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING, 2000, 0.05)); + enquire1.set_weighting_scheme(Xapian::LMDirichletWeight(2000, 0)); + enquire2.set_weighting_scheme(Xapian::LMDirichletWeight(2000, 0.05)); mset1 = enquire1.get_mset(0, 10); mset2 = enquire2.get_mset(0, 10); @@ -1720,38 +1685,13 @@ DEFINE_TESTCASE(unigramlmweight7, backend) { TEST_EQUAL(mset1.size(), 5); TEST_EQUAL(mset2.size(), 5); - // Expect mset weights associated with Dir+ more than mset weights by Dir - // because of the presence of extra weight component in Dir+ function. - TEST_REL(mset2[0].get_weight(),>,mset1[0].get_weight()); - TEST_REL(mset2[1].get_weight(),>,mset1[1].get_weight()); - TEST_REL(mset2[2].get_weight(),>,mset1[2].get_weight()); - TEST_REL(mset2[3].get_weight(),>,mset1[3].get_weight()); - TEST_REL(mset2[4].get_weight(),>,mset1[4].get_weight()); -} - -// Regression test that OP_SCALE_WEIGHT works with LMWeight (fixed in 1.4.1). -DEFINE_TESTCASE(unigramlmweight8, backend) { - Xapian::Database db = get_database("apitest_simpledata"); - Xapian::Enquire enquire(db); - Xapian::Query query("paragraph"); - - enquire.set_query(query); - enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0)); - - Xapian::MSet mset1; - mset1 = enquire.get_mset(0, 10); - TEST_EQUAL(mset1.size(), 5); - - enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0)); - enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0)); - - Xapian::MSet mset2; - mset2 = enquire.get_mset(0, 10); - TEST_EQUAL(mset2.size(), mset1.size()); - TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0); - for (Xapian::doccount i = 0; i < mset1.size(); ++i) { - TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight()); - } + // Expect mset weights from Dir+ to be less than mset weights from + // Dirichlet for this testcase. + TEST_REL(mset2[0].get_weight(),<,mset1[0].get_weight()); + TEST_REL(mset2[1].get_weight(),<,mset1[1].get_weight()); + TEST_REL(mset2[2].get_weight(),<,mset1[2].get_weight()); + TEST_REL(mset2[3].get_weight(),<,mset1[3].get_weight()); + TEST_REL(mset2[4].get_weight(),<,mset1[4].get_weight()); } // Feature test for CoordWeight. diff --git a/xapian-core/weight/lmweight.cc b/xapian-core/weight/lmweight.cc dissimilarity index 86% index 44b258432..3648b244d 100644 --- a/xapian-core/weight/lmweight.cc +++ b/xapian-core/weight/lmweight.cc @@ -1,316 +1,583 @@ -/** @file - * @brief Xapian::LMWeight class - the Unigram Language Modelling formula. - */ -/* Copyright (C) 2012 Gaurav Arora - * Copyright (C) 2016,2019 Olly Betts - * Copyright (C) 2016 Vivek Pal - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "xapian/weight.h" -#include "weightinternal.h" - -#include "debuglog.h" -#include "omassert.h" -#include "serialise-double.h" -#include "stringutils.h" - -#include "xapian/error.h" - -#include - -using namespace std; - -namespace Xapian { - -LMWeight * -LMWeight::clone() const { - return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2); -} - -void -LMWeight::init(double factor_) -{ - factor = factor_; - - // Storing collection frequency of current term in collection_freq to be - // accessed while smoothing of weights for the term, for term not present - // in the document. - double collection_freq = get_collection_freq(); - - // Collection_freq of a term in collection should be always greater than or - // equal to zero (Non Negative). - AssertRel(collection_freq,>=,0); - LOGVALUE(WTCALC, collection_freq); - - // This is the total number of term occurrences in the collection, which we - // use for smoothing. - Xapian::totallength total_length = get_total_length(); - - /* In case the within document frequency of term is zero smoothing will - * be required and should be return instead of returning zero, as returning - * LM score are multiplication of contribution of all terms, due to absence - * of single term whole document is scored zero, hence apply collection - * frequency smoothing. - */ - if (rare(total_length == 0)) { - AssertEq(collection_freq, 0); - weight_collection = 0; - } else { - weight_collection = double(collection_freq) / total_length; - } - - // There can't be more relevant term in collection than total number of - // term. - AssertRel(weight_collection,<=,1.0); - - /* Setting default values of the param_log to handle negative value of log. - * It is considered to be upperbound of document length. - * initializing param_log to upperbound of document_length. - */ - - if (param_log == 0.0) { - param_log = get_doclength_upper_bound(); - } - - /* Since the optimal parameter for Jelinek mercer smoothing - * is based on query length, so if query is title query changing - * default value of smoothing parameter. - */ - - if (select_smoothing == JELINEK_MERCER_SMOOTHING || - select_smoothing == TWO_STAGE_SMOOTHING) { - if (param_smoothing1 == 0.7) { - if (get_query_length() <= 2) { - param_smoothing1 = 0.1; - } - } - } - - /* param_smoothing1 default value should be 2000 in case - * DIRICHLET_SMOOTHING is selected. 
Tweaking param_smoothing1 - * if user supply his own value for param_smoothing1 value will not be set - * to 2000(default value) - */ - if (select_smoothing == DIRICHLET_SMOOTHING) { - if (param_smoothing1 == 0.7) { - param_smoothing1 = 2000; - } - } - - /* Setting param_smoothing1 and param_smoothing2 default value to used when - * DIRICHLET_PLUS_SMOOTHING is selected.*/ - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - if (param_smoothing1 == 0.7) { - param_smoothing1 = 2000; - } - } -} - -string -LMWeight::name() const -{ - return "Xapian::LMWeight"; -} - -string -LMWeight::short_name() const -{ - return "lm"; -} - -string -LMWeight::serialise() const -{ - string result = serialise_double(param_log); - result += static_cast(select_smoothing); - result += serialise_double(param_smoothing1); - result += serialise_double(param_smoothing2); - return result; -} - -LMWeight * -LMWeight::unserialise(const string & s) const -{ - const char *ptr = s.data(); - const char *end = ptr + s.size(); - double param_log_ = unserialise_double(&ptr, end); - type_smoothing select_smoothing_ = static_cast(*(ptr)++); - double param_smoothing1_ = unserialise_double(&ptr, end); - double param_smoothing2_ = unserialise_double(&ptr, end); - if (rare(ptr != end)) - throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()"); - return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_); -} - -double -LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len, - Xapian::termcount uniqterm, Xapian::termcount) const -{ - // Within Document Frequency of the term in document being considered. - double wdf_double = wdf; - // Length of the Document in terms of number of terms. - double len_double = len; - // variable to store weight contribution of term in the document scoring for LM. - double weight_sum; - - // Calculating weights considering different smoothing option available to user. - if (select_smoothing == JELINEK_MERCER_SMOOTHING) { - /* Maximum likelihood of current term, weight contribution of term in - * case query term is present in the document. - */ - double weight_document = wdf_double / len_double; - weight_sum = (param_smoothing1 * weight_collection) + - ((1 - param_smoothing1) * weight_document); - } else if (select_smoothing == DIRICHLET_SMOOTHING) { - weight_sum = (wdf_double + (param_smoothing1 * weight_collection)) / - (len_double + param_smoothing1); - } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - /* In the Dir+ weighting formula, sumpart weight contribution is :- - * - * sum of log of (1 + (wdf/(param_smoothing1 * weight_collection))) and - * log of (1 + (delta/param_smoothing1 * weight_collection))). - * Since, sum of logs is log of product so weight_sum is calculated as product - * of terms in log in the Dir+ formula. - */ - weight_sum = (1 + (wdf_double / (param_smoothing1 * weight_collection))) * - (1 + (param_smoothing2 / (param_smoothing1 * weight_collection))); - } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) { - double uniqterm_double = uniqterm; - weight_sum = ((((wdf_double - param_smoothing1) > 0) ? 
(wdf_double - param_smoothing1) : 0) / len_double) + ((param_smoothing1 * weight_collection * uniqterm_double) / len_double); - } else { - weight_sum = (((1 - param_smoothing1) * (wdf_double + (param_smoothing2 * weight_collection)) / (len_double + param_smoothing2)) + (param_smoothing1 * weight_collection)); - } - - /* Since LM score is calculated with multiplication, instead of changing - * the current implementation log trick have been used to calculate the - * product since (sum of log is log of product and since aim is ranking - * ranking document by product or log of product won't make a large - * difference hence log(product) will be used for ranking. - */ - double product = weight_sum * param_log; - return (product > 1.0) ? factor * log(product) : 0; -} - -double -LMWeight::get_maxpart() const -{ - // Variable to store the collection frequency - double upper_bound; - // Store upper bound on wdf in variable wdf_max - double wdf_max = get_wdf_upper_bound(); - - // Calculating upper bound considering different smoothing option available to user. - if (select_smoothing == JELINEK_MERCER_SMOOTHING) { - upper_bound = (param_smoothing1 * weight_collection) + (1 - param_smoothing1); - } else if (select_smoothing == DIRICHLET_SMOOTHING) { - upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing1); - } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - upper_bound = (1 + (wdf_max / (param_smoothing1 * weight_collection))) * - (1 + (param_smoothing2 / (param_smoothing1 * weight_collection))); - } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) { - upper_bound = param_smoothing1 * weight_collection + 1; - } else { - upper_bound = (((1 - param_smoothing1) * (get_doclength_upper_bound() + (param_smoothing2 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing2)) + (param_smoothing1 * weight_collection)); - } - - /* Since weight are calculated using log trick, using same with the bounds. Refer - * comment in get_sumpart for the details. - */ - double product = upper_bound * param_log; - return (product > 1.0) ? factor * log(product) : 1.0; -} - -/* The extra weight component in the Dir+ formula is :- - * - * |Q| * log (param_smoothing1 / (|D| + param_smoothing1)) - * - * where, |Q| is total query length. - * |D| is total document length. 
- */ -double -LMWeight::get_sumextra(Xapian::termcount len, - Xapian::termcount, - Xapian::termcount) const -{ - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - double extra_weight = param_smoothing1 / (len + param_smoothing1); - return get_query_length() * log(extra_weight); - } - return 0; -} - -double -LMWeight::get_maxextra() const -{ - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - double extra_weight = param_smoothing1 / (get_doclength_lower_bound() + param_smoothing1); - return get_query_length() * log(extra_weight); - } - return 0; -} - -static bool -type_smoothing_param(const char ** p, Xapian::Weight::type_smoothing * ptr_val) -{ - const char* q = *p; - char ch = *q++; - if (ch < '1' || ch > '5' || C_isdigit(*q)) { - return false; - } - *p = q; - static const Xapian::Weight::type_smoothing smooth_tab[5] = { - Xapian::Weight::TWO_STAGE_SMOOTHING, - Xapian::Weight::DIRICHLET_SMOOTHING, - Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING, - Xapian::Weight::JELINEK_MERCER_SMOOTHING, - Xapian::Weight::DIRICHLET_PLUS_SMOOTHING - }; - *ptr_val = smooth_tab[ch - '1']; - return true; -} - -static inline void -parameter_error(const char* message) -{ - Xapian::Weight::Internal::parameter_error(message, "lm"); -} - -LMWeight * -LMWeight::create_from_parameters(const char * p) const -{ - if (*p == '\0') - return new Xapian::LMWeight(); - double param_log_ = 0; - Xapian::Weight::type_smoothing type = Xapian::Weight::TWO_STAGE_SMOOTHING; - double smoothing1 = 0.7; - double smoothing2 = 2000; - if (!Xapian::Weight::Internal::double_param(&p, ¶m_log_)) - parameter_error("Parameter 1 (log) is invalid"); - if (*p && !type_smoothing_param(&p, &type)) - parameter_error("Parameter 2 (smoothing_type) is invalid"); - if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing1)) - parameter_error("Parameter 3 (smoothing1) is invalid"); - if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing2)) - parameter_error("Parameter 4 (smoothing2) is invalid"); - if (*p) - parameter_error("Extra data after parameter 4"); - return new Xapian::LMWeight(param_log_, type, smoothing1, smoothing2); -} - -} +/** @file + * @brief Unigram Language Modelling weighting classes + */ +/* Copyright (C) 2012 Gaurav Arora + * Copyright (C) 2016,2019,2024 Olly Betts + * Copyright (C) 2016 Vivek Pal + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "xapian/weight.h" +#include "weightinternal.h" + +#include "debuglog.h" +#include "omassert.h" +#include "serialise-double.h" +#include "stringutils.h" + +#include "xapian/error.h" + +#include + +using namespace std; + +[[noreturn]] +static inline void +parameter_error(const char* message, const std::string& scheme) +{ + Xapian::Weight::Internal::parameter_error(message, scheme); +} + +/* The equations for Jelinek-Mercer, Dirichlet, and Absolute Discount smoothing + * are taken from: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + * + * The equations for Two-Stage smoothing are also from this paper, but aren't + * given explicitly there, so we have derived them using the approach + * described in the paper. + * + * Dir+ comes from: + * + * Lv, Y., & Zhai, C. (2011). Lower-bounding term frequency normalization. + * International Conference on Information and Knowledge Management. + */ + +namespace Xapian { + +void +LMJMWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + double lambda = param_lambda; + if (lambda <= 0.0 || lambda >= 1.0) { + auto query_len = get_query_length(); + if (query_len <= 2) { + lambda = 0.1; + } else if (query_len < 8) { + lambda = (query_len - 1) * 0.1; + } else { + lambda = 0.7; + } + } + + // Pre-calculate multiplier. + auto collection_freq = get_collection_freq(); + multiplier = (1.0 - lambda) * total_length / (lambda * collection_freq); +} + +double +LMJMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len, + Xapian::termcount, Xapian::termcount) const +{ + double w = multiplier * wdf / len; + return factor * log(1.0 + w); +} + +double +LMJMWeight::get_maxpart() const +{ + Xapian::termcount wdf_max = get_wdf_upper_bound(); + Xapian::termcount len_min = get_doclength_lower_bound(); + double w = multiplier; + if (wdf_max < len_min) { + // Clearly wdf / len <= wdf_max / len_min, but we also know that + // wdf <= len so wdf / len <= 1 so we use whichever bound is tighter. 
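+ // (A worked instance with made-up numbers: if wdf_max = 40 and + // len_min = 100 then wdf / len <= 0.4, which is tighter than 1; but if + // wdf_max = 200 and len_min = 100 the best bound available is + // wdf / len <= 1, and we leave w alone.)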
+ w *= double(wdf_max) / len_min; + } + return factor * log(1.0 + w); +} + +LMJMWeight* +LMJMWeight::clone() const { + return new LMJMWeight(param_lambda); +} + +string +LMJMWeight::name() const +{ + return "Xapian::LMJMWeight"; +} + +string +LMJMWeight::short_name() const +{ + return "lmjm"; +} + +string +LMJMWeight::serialise() const +{ + return serialise_double(param_lambda); +} + +LMJMWeight* +LMJMWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double lambda = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LMJMWeight::unserialise()"); + return new LMJMWeight(lambda); +} + + +LMJMWeight* +LMJMWeight::create_from_parameters(const char* p) const +{ + double lambda = 0.0; + if (*p && !Xapian::Weight::Internal::double_param(&p, &lambda)) + parameter_error("Parameter lambda is invalid", "lmjm"); + if (*p) + parameter_error("Extra data after parameter", "lmjm"); + return new Xapian::LMJMWeight(lambda); +} + +void +LMDirichletWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + double mu = param_mu; + + auto doclen_max = get_doclength_upper_bound(); + extra_offset = get_query_length() * log(doclen_max + mu); + if (factor == 0.0) { + // This object is for the term-independent contribution. + return; + } + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + // Pre-calculate the factor to multiply by. + multiplier = total_length / (get_collection_freq() * mu); + + double delta = param_delta; + if (delta != 0.0) { + // Include the document-independent Dir+ extra contribution in factor. + factor *= log(1.0 + delta * multiplier); + } +} + +double +LMDirichletWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount, + Xapian::termcount, Xapian::termcount) const +{ + return factor * log(1.0 + wdf * multiplier); +} + +double +LMDirichletWeight::get_maxpart() const +{ + Xapian::termcount wdf_max = get_wdf_upper_bound(); + return factor * log(1.0 + wdf_max * multiplier); +} + +double +LMDirichletWeight::get_sumextra(Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const +{ + // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent + // constant" which is: + // + // query_len * log(mu / (doclen + mu)) + // + // (The equation for Dir+ is the same.) + // + // Both mu and doclen are positive, so this value will be negative but + // get_sumextra() must return a non-negative value. To solve this we + // decompose this into: + // + // query_len * log(mu) - query_len * log(doclen + mu) + // + // The first part is constant for a given query so we ignore it. The second + // part is still negative, but we can add query_len * log(doclen_max + mu) + // (also constant for a given query) to address that, giving: + // + // query_len * log(doclen_max + mu) - query_len * log(doclen + mu) + // + // We pre-calculate the first part in init() and save it in member variable + // extra_offset.
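+ // (A worked instance with made-up numbers: with mu = 2000 and + // doclen_max = 500, extra_offset is query_len * log(2500), so a document + // of length 100 gets sumextra of query_len * (log(2500) - log(2100)), + // which is small but non-negative as required.)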
+ return extra_offset - get_query_length() * log(doclen + param_mu); +} + +double +LMDirichletWeight::get_maxextra() const +{ + auto doclen_min = get_doclength_lower_bound(); + return extra_offset - get_query_length() * log(doclen_min + param_mu); +} + +LMDirichletWeight* +LMDirichletWeight::clone() const { + return new LMDirichletWeight(param_mu, param_delta); +} + +string +LMDirichletWeight::name() const +{ + return "Xapian::LMDirichletWeight"; +} + +string +LMDirichletWeight::short_name() const +{ + return "lmdirichlet"; +} + +string +LMDirichletWeight::serialise() const +{ + string result = serialise_double(param_mu); + result += serialise_double(param_delta); + return result; +} + +LMDirichletWeight* +LMDirichletWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double mu = unserialise_double(&ptr, end); + double delta = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LMDirichletWeight::unserialise()"); + return new LMDirichletWeight(mu, delta); +} + + +LMDirichletWeight* +LMDirichletWeight::create_from_parameters(const char* p) const +{ + double mu = 2000.0; + double delta = 0.05; + if (*p && !Xapian::Weight::Internal::double_param(&p, &mu)) + parameter_error("Parameter mu is invalid", "lmdirichlet"); + if (*p && !Xapian::Weight::Internal::double_param(&p, &delta)) + parameter_error("Parameter delta is invalid", "lmdirichlet"); + if (*p) + parameter_error("Extra data after parameters", "lmdirichlet"); + return new Xapian::LMDirichletWeight(mu, delta); +} + +void +LMAbsDiscountWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + auto doclen_max = get_doclength_upper_bound(); + extra_offset = get_query_length() * log(double(doclen_max)); + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + // Pre-calculate the factor to multiply by. + multiplier = total_length / (param_delta * get_collection_freq()); +} + +double +LMAbsDiscountWeight::get_sumpart(Xapian::termcount wdf, + Xapian::termcount, + Xapian::termcount uniqterms, + Xapian::termcount) const +{ + return factor * log(1.0 + (wdf - param_delta) / uniqterms * multiplier); +} + +double +LMAbsDiscountWeight::get_maxpart() const +{ + Xapian::termcount doclen_min = get_doclength_lower_bound(); + Xapian::termcount wdf_max = get_wdf_upper_bound(); + double x = (wdf_max - param_delta) * multiplier; + // We need a lower bound on uniqterms. We have doclen = sum(wdf) so: + // + // uniqterms >= ceil(doclen_min / wdf_max) + // + // We also know uniqterms >= 1 (since documents without terms won't match). + if (doclen_min > wdf_max) + x /= (doclen_min - 1) / wdf_max + 1; + return factor * log(1.0 + x); +} + +double +LMAbsDiscountWeight::get_sumextra(Xapian::termcount doclen, + Xapian::termcount uniqterms, + Xapian::termcount) const +{ + // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent + // constant" which is: + // + // query_len * log(delta * uniqterms / doclen) + // + // We know delta < 1 and uniqterms <= doclen so this value will be negative + // but get_sumextra() must return a non-negative value. To solve this we + // decompose this into: + // + // query_len * log(delta) + query_len * log(uniqterms / doclen) + // + // The first part is constant for a given query so we ignore it.
The second + // part is still negative, but we can subtract query_len * log(1 / doclen_max) + // (also constant for a given query) to address that, giving: + // + // -query_len * log(1 / doclen_max) + query_len * log(uniqterms / doclen) + // = query_len * log(doclen_max) + query_len * log(uniqterms / doclen) + // + // We pre-calculate the first part in init() and save it in member variable + // extra_offset. + return extra_offset + get_query_length() * log(double(uniqterms) / doclen); +} + +double +LMAbsDiscountWeight::get_maxextra() const +{ + // Our best bound seems to be based on uniqterms <= doclen, which gives + // log(uniqterms / doclen) <= 0. + return extra_offset; +} + +LMAbsDiscountWeight* +LMAbsDiscountWeight::clone() const { + return new LMAbsDiscountWeight(param_delta); +} + +string +LMAbsDiscountWeight::name() const +{ + return "Xapian::LMAbsDiscountWeight"; +} + +string +LMAbsDiscountWeight::short_name() const +{ + return "lmabsdiscount"; +} + +string +LMAbsDiscountWeight::serialise() const +{ + return serialise_double(param_delta); +} + +LMAbsDiscountWeight* +LMAbsDiscountWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double delta = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LMAbsDiscountWeight::unserialise()"); + return new LMAbsDiscountWeight(delta); +} + + +LMAbsDiscountWeight* +LMAbsDiscountWeight::create_from_parameters(const char* p) const +{ + double delta = 0.7; + if (*p && !Xapian::Weight::Internal::double_param(&p, &delta)) + parameter_error("Parameter delta is invalid", "lmabsdiscount"); + if (*p) + parameter_error("Extra data after parameter", "lmabsdiscount"); + return new Xapian::LMAbsDiscountWeight(delta); +} + +void +LM2StageWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + double lambda = param_lambda; + double mu = param_mu; + + auto doclen_max = get_doclength_upper_bound(); + extra_offset = -log((lambda * doclen_max + mu) / (doclen_max + mu)); + extra_offset *= get_query_length(); + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + // Pre-calculate the factor to multiply by. + multiplier = (1 - lambda) * total_length / get_collection_freq(); +} + +double +LM2StageWeight::get_sumpart(Xapian::termcount wdf, + Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const +{ + // Termweight formula is: log{ 1 + (1-λ) c(w;d) / ( (λ|d|+μ) p(w|C) ) } + double lambda = param_lambda; + double mu = param_mu; + return factor * log(1.0 + wdf / (lambda * doclen + mu) * multiplier); +} + +double +LM2StageWeight::get_maxpart() const +{ + double lambda = param_lambda; + double mu = param_mu; + Xapian::termcount doclen_min = get_doclength_lower_bound(); + Xapian::termcount wdf_max = get_wdf_upper_bound(); + // We know wdf <= doclen so if the bounds don't rule out wdf and doclen + // being equal we want to find the wdf value w which maximises + // (w / (lambda * w + mu)), which is just a case of maximising w, i.e. + // wdf_max. Otherwise we evaluate at wdf = wdf_max, doclen = doclen_min.
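+ // (A worked instance with made-up numbers, taking lambda = 0.7 and + // mu = 2000: if doclen_min = 10 and wdf_max = 40 then wdf = doclen = 40 + // is possible, giving x = 40 / (0.7 * 40 + 2000); if instead + // doclen_min = 100 we evaluate at wdf = 40, doclen = 100, giving + // x = 40 / (0.7 * 100 + 2000).)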
+ double x = wdf_max / (lambda * max(doclen_min, wdf_max) + mu); + return factor * log(1.0 + x * multiplier); +} + +double +LM2StageWeight::get_sumextra(Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const +{ + // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent + // constant" which is: + // + // query_len * log αd + // + // where αd = ( λ|d| + μ ) / ( |d| + μ ) for two-stage smoothing, so: + // + // query_len * log{ (lambda * doclen + mu) / (doclen + mu) } + // + // We know lambda < 1 so this will be negative but get_sumextra() must + // return a non-negative value. To achieve this we subtract: + // + // query_len * log{ (lambda * doclen_max + mu) / (doclen_max + mu) } + // + // We pre-calculate the negation of this and store it in extra_offset. + double lambda = param_lambda; + double mu = param_mu; + return extra_offset + + get_query_length() * log((lambda * doclen + mu) / (doclen + mu)); +} + +double +LM2StageWeight::get_maxextra() const +{ + // Our best bound is at the doclen value which maximises + // + // log((lambda * doclen + mu) / (doclen + mu)) + // + // which means maximising + // + // (lambda * doclen + mu) / (doclen + mu) + // + // Putting d' = doclen + mu we want to maximise + // + // lambda + (1 - lambda) * mu / d' + // + // (1 - lambda) > 0 so this is achieved by minimising d' which means + // minimising doclen. + double lambda = param_lambda; + double mu = param_mu; + auto doclen = get_doclength_lower_bound(); + return extra_offset + + get_query_length() * log((lambda * doclen + mu) / (doclen + mu)); +} + +LM2StageWeight* +LM2StageWeight::clone() const { + return new LM2StageWeight(param_lambda, param_mu); +} + +string +LM2StageWeight::name() const +{ + return "Xapian::LM2StageWeight"; +} + +string +LM2StageWeight::short_name() const +{ + return "lm2stage"; +} + +string +LM2StageWeight::serialise() const +{ + string result = serialise_double(param_lambda); + result += serialise_double(param_mu); + return result; +} + +LM2StageWeight* +LM2StageWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double lambda = unserialise_double(&ptr, end); + double mu = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LM2StageWeight::unserialise()"); + return new LM2StageWeight(lambda, mu); +} + +LM2StageWeight* +LM2StageWeight::create_from_parameters(const char* p) const +{ + double lambda = 0.7; + double mu = 2000.0; + if (*p && !Xapian::Weight::Internal::double_param(&p, &lambda)) + parameter_error("Parameter lambda is invalid", "lm2stage"); + if (*p && !Xapian::Weight::Internal::double_param(&p, &mu)) + parameter_error("Parameter mu is invalid", "lm2stage"); + if (*p) + parameter_error("Extra data after parameters", "lm2stage"); + return new Xapian::LM2StageWeight(lambda, mu); +} + +} diff --git a/xapian-core/weight/weightinternal.h b/xapian-core/weight/weightinternal.h index b71fb2869..14ff54cae 100644 --- a/xapian-core/weight/weightinternal.h +++ b/xapian-core/weight/weightinternal.h @@ -315,6 +315,7 @@ class Weight::Internal { return true; } + [[noreturn]] static void parameter_error(const char * msg, const std::string & scheme) { std::string m(msg); -- 2.11.4.GIT
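A minimal usage sketch of the replacement classes (illustrative only, not part of the patch; the database path and query term are placeholders, and the parameter values shown are just the documented defaults):

    #include <xapian.h>
    #include <iostream>

    int main() {
        Xapian::Database db("/path/to/db"); // placeholder path
        Xapian::Enquire enquire(db);
        enquire.set_query(Xapian::Query("paragraph"));
        // Dir+ smoothing with the default parameters (mu = 2000,
        // delta = 0.05); passing delta = 0 gives plain Dirichlet.
        // LMJMWeight, LMAbsDiscountWeight and LM2StageWeight are
        // selected the same way.
        enquire.set_weighting_scheme(Xapian::LMDirichletWeight(2000.0, 0.05));
        Xapian::MSet mset = enquire.get_mset(0, 10);
        for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i)
            std::cout << *i << "\t" << i.get_weight() << "\n";
    }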