From b97c122cf6f9d06fc8ace542735b0d9e9f18e9bc Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 20 Jun 2024 16:49:45 +1200 Subject: [PATCH] Reimplement Language Modelling weights LMWeight was using incorrect formulae for all the smoothing schemes. The replacement implementation separates each smoothing into a separate class (except that Dirichlet and Dir+ are combined into one class), since that seems to give a cleaner public API, and also simplifies the implementation. It also means we can now clearly say "don't use LMWeight" without risk of confusion. Fixes #779, reported by Sourav Saha. Closes https://github.com/xapian/xapian/pull/230 --- xapian-bindings/csharp/Makefile.am | 5 +- xapian-bindings/java/Makefile.am | 5 +- xapian-bindings/python3/pythontest.py | 5 +- xapian-core/api/registry.cc | 11 +- xapian-core/examples/quest.cc | 51 +- xapian-core/include/xapian/weight.h | 326 +++++++++--- xapian-core/tests/api_nodb.cc | 2 +- xapian-core/tests/api_weight.cc | 144 ++---- xapian-core/weight/lmweight.cc | 899 ++++++++++++++++++++++------------ xapian-core/weight/weightinternal.h | 1 + 10 files changed, 927 insertions(+), 522 deletions(-) rewrite xapian-core/weight/lmweight.cc (86%) diff --git a/xapian-bindings/csharp/Makefile.am b/xapian-bindings/csharp/Makefile.am index 5d4f07232..64a55450b 100644 --- a/xapian-bindings/csharp/Makefile.am +++ b/xapian-bindings/csharp/Makefile.am @@ -35,7 +35,10 @@ XAPIAN_SWIG_CS_SRCS=\ generated-csharp/IneB2Weight.cs \ generated-csharp/InL2Weight.cs \ generated-csharp/KeyMaker.cs \ - generated-csharp/LMWeight.cs \ + generated-csharp/LM2StageWeight.cs \ + generated-csharp/LMAbsDiscountWeight.cs \ + generated-csharp/LMDirichletWeight.cs \ + generated-csharp/LMJMWeight.cs \ generated-csharp/LatLongCoord.cs \ generated-csharp/LatLongCoords.cs \ generated-csharp/LatLongCoordsIterator.cs \ diff --git a/xapian-bindings/java/Makefile.am b/xapian-bindings/java/Makefile.am index a633d92e8..6534e26ca 100644 --- a/xapian-bindings/java/Makefile.am +++ b/xapian-bindings/java/Makefile.am @@ -59,7 +59,10 @@ XAPIAN_SWIG_JAVA_SRCS=\ org/xapian/IneB2Weight.java\ org/xapian/InL2Weight.java\ org/xapian/KeyMaker.java\ - org/xapian/LMWeight.java \ + org/xapian/LM2StageWeight.java\ + org/xapian/LMAbsDiscountWeight.java\ + org/xapian/LMDirichletWeight.java\ + org/xapian/LMJMWeight.java\ org/xapian/LatLongCoord.java\ org/xapian/LatLongCoords.java\ org/xapian/LatLongCoordsIterator.java\ diff --git a/xapian-bindings/python3/pythontest.py b/xapian-bindings/python3/pythontest.py index 3109096f2..d1524531e 100644 --- a/xapian-bindings/python3/pythontest.py +++ b/xapian-bindings/python3/pythontest.py @@ -1697,7 +1697,10 @@ def test_repr(): expect(repr(xapian.PL2Weight()) is None, False) expect(repr(xapian.PL2PlusWeight()) is None, False) expect(repr(xapian.DPHWeight()) is None, False) - expect(repr(xapian.LMWeight()) is None, False) + expect(repr(xapian.LM2StageWeight()) is None, False) + expect(repr(xapian.LMAbsDiscountWeight()) is None, False) + expect(repr(xapian.LMDirichletWeight()) is None, False) + expect(repr(xapian.LMJMWeight()) is None, False) expect(repr(xapian.CoordWeight()) is None, False) expect(repr(xapian.Compactor()) is None, False) expect(repr(xapian.ValuePostingSource(1)) is None, False) diff --git a/xapian-core/api/registry.cc b/xapian-core/api/registry.cc index b1801c510..0c20fa0b6 100644 --- a/xapian-core/api/registry.cc +++ b/xapian-core/api/registry.cc @@ -268,7 +268,16 @@ Registry::Internal::add_defaults() weighting_scheme = new Xapian::DPHWeight; 
wtschemes[weighting_scheme->name()] = weighting_scheme; wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; - weighting_scheme = new Xapian::LMWeight; + weighting_scheme = new Xapian::LMJMWeight; + wtschemes[weighting_scheme->name()] = weighting_scheme; + wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; + weighting_scheme = new Xapian::LMDirichletWeight; + wtschemes[weighting_scheme->name()] = weighting_scheme; + wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; + weighting_scheme = new Xapian::LMAbsDiscountWeight; + wtschemes[weighting_scheme->name()] = weighting_scheme; + wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; + weighting_scheme = new Xapian::LM2StageWeight; wtschemes[weighting_scheme->name()] = weighting_scheme; wtschemes_short[weighting_scheme->short_name()] = weighting_scheme; weighting_scheme = new Xapian::DiceCoeffWeight; diff --git a/xapian-core/examples/quest.cc b/xapian-core/examples/quest.cc index 9680799be..ddc0a006c 100644 --- a/xapian-core/examples/quest.cc +++ b/xapian-core/examples/quest.cc @@ -120,7 +120,10 @@ enum { WEIGHT_IFB2, WEIGHT_INEB2, WEIGHT_INL2, - WEIGHT_LM, + WEIGHT_LM2STAGE, + WEIGHT_LMABSDISCOUNT, + WEIGHT_LMDIRICHLET, + WEIGHT_LMJM, WEIGHT_PL2, WEIGHT_PL2PLUS, WEIGHT_TFIDF, @@ -128,21 +131,24 @@ enum { }; static const tab_entry wt_tab[] = { - { "bb2", WEIGHT_BB2 }, - { "bm25", WEIGHT_BM25 }, - { "bm25+", WEIGHT_BM25PLUS }, - { "bool", WEIGHT_BOOL }, - { "coord", WEIGHT_COORD }, - { "dlh", WEIGHT_DLH }, - { "dph", WEIGHT_DPH }, - { "ifb2", WEIGHT_IFB2 }, - { "ineb2", WEIGHT_INEB2 }, - { "inl2", WEIGHT_INL2 }, - { "lm", WEIGHT_LM }, - { "pl2", WEIGHT_PL2 }, - { "pl2+", WEIGHT_PL2PLUS }, - { "tfidf", WEIGHT_TFIDF }, - { "trad", WEIGHT_TRAD } + { "bb2", WEIGHT_BB2 }, + { "bm25", WEIGHT_BM25 }, + { "bm25+", WEIGHT_BM25PLUS }, + { "bool", WEIGHT_BOOL }, + { "coord", WEIGHT_COORD }, + { "dlh", WEIGHT_DLH }, + { "dph", WEIGHT_DPH }, + { "ifb2", WEIGHT_IFB2 }, + { "ineb2", WEIGHT_INEB2 }, + { "inl2", WEIGHT_INL2 }, + { "lm2stage", WEIGHT_LM2STAGE }, + { "lmabsdiscount", WEIGHT_LMABSDISCOUNT }, + { "lmdirichlet", WEIGHT_LMDIRICHLET }, + { "lmjm", WEIGHT_LMJM }, + { "pl2", WEIGHT_PL2 }, + { "pl2+", WEIGHT_PL2PLUS }, + { "tfidf", WEIGHT_TFIDF }, + { "trad", WEIGHT_TRAD } }; /** The number of spaces to indent by in print_table. @@ -398,8 +404,17 @@ try { case WEIGHT_INL2: enquire.set_weighting_scheme(Xapian::InL2Weight()); break; - case WEIGHT_LM: - enquire.set_weighting_scheme(Xapian::LMWeight()); + case WEIGHT_LM2STAGE: + enquire.set_weighting_scheme(Xapian::LM2StageWeight()); + break; + case WEIGHT_LMABSDISCOUNT: + enquire.set_weighting_scheme(Xapian::LMAbsDiscountWeight()); + break; + case WEIGHT_LMDIRICHLET: + enquire.set_weighting_scheme(Xapian::LMDirichletWeight()); + break; + case WEIGHT_LMJM: + enquire.set_weighting_scheme(Xapian::LMJMWeight()); break; case WEIGHT_PL2: enquire.set_weighting_scheme(Xapian::PL2Weight()); diff --git a/xapian-core/include/xapian/weight.h b/xapian-core/include/xapian/weight.h index 9877d67d2..f6055b732 100644 --- a/xapian-core/include/xapian/weight.h +++ b/xapian-core/include/xapian/weight.h @@ -250,18 +250,6 @@ class XAPIAN_VISIBILITY_DEFAULT Weight { /// Default constructor, needed by subclass constructors. Weight() : stats_needed() { } - /** Type of smoothing to use with the Language Model Weighting scheme. - * - * Default is TWO_STAGE_SMOOTHING. 
- */ - typedef enum { - TWO_STAGE_SMOOTHING = 1, - DIRICHLET_SMOOTHING = 2, - ABSOLUTE_DISCOUNT_SMOOTHING = 3, - JELINEK_MERCER_SMOOTHING = 4, - DIRICHLET_PLUS_SMOOTHING = 5 - } type_smoothing; - class Internal; /** Virtual destructor, because we have virtual methods. */ @@ -1827,112 +1815,288 @@ class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight { }; -/** Xapian::Weight subclass implementing the Language Model formula. +/** Language Model weighting with Jelinek-Mercer smoothing. * - * This class implements the "Language Model" Weighting scheme, as - * described by the early papers on LM by Bruce Croft. + * As described in: * - * LM works by comparing the query to a Language Model of the document. - * The language model itself is parameter-free, though LMWeight takes - * parameters which specify the smoothing used. + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. */ -class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight { +class XAPIAN_VISIBILITY_DEFAULT LMJMWeight : public Weight { /// The factor to multiply weights by. double factor; - /** The type of smoothing to use. */ - type_smoothing select_smoothing; - - // Parameters for handling negative value of log, and for smoothing. - double param_log, param_smoothing1, param_smoothing2; + /// Parameter controlling the smoothing. + double param_lambda; - // Collection weight. - double weight_collection; + /// Precalculated multiplier for use in weight calculations. + double multiplier; - LMWeight * clone() const; + LMJMWeight* clone() const; void init(double factor_); public: - /** Construct a LMWeight. - * - * @param param_log_ A non-negative parameter controlling how much - * to clamp negative values returned by the log. - * The log is calculated by multiplying the - * actual weight by param_log. If param_log is - * 0.0, then the document length upper bound will - * be used (default: document length upper bound) - * - * @param select_smoothing_ A parameter of type enum - * type_smoothing. This parameter - * controls which smoothing type to use. - * (default: TWO_STAGE_SMOOTHING) - * - * @param param_smoothing1_ A non-negative parameter for smoothing - * whose meaning depends on - * select_smoothing_. In - * JELINEK_MERCER_SMOOTHING, it plays the - * role of estimation and in - * DIRICHLET_SMOOTHING the role of query - * modelling. (default JELINEK_MERCER, - * ABSOLUTE, TWOSTAGE(0.7), - * DIRCHLET(2000)) - * - * @param param_smoothing2_ A non-negative parameter which is used - * with TWO_STAGE_SMOOTHING as parameter for Dirichlet's - * smoothing (default: 2000) and as parameter delta to - * control the scale of the tf lower bound in the - * DIRICHLET_PLUS_SMOOTHING (default 0.05). + /** Construct a LMJMWeight. * + * @param lambda A parameter strictly between 0 and 1 which linearly + * interpolates between the maximum likelihood model (the + * limit as λ→0) and the collection model (the limit as + * λ→1). + * + * Values of λ around 0.1 are apparently optimal for short + * queries and around 0.7 for long queries. If lambda is + * out of range (i.e. <= 0 or >= 1) then the λ value used + * is chosen dynamically based on the query length using + * the formula: + * + * (query_length - 1) / 10.0 + * + * The result is clamped to 0.1 for query_length <= 2, and + * to 0.7 for query_length >= 8. */ - // Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing. 
- explicit LMWeight(double param_log_ = 0.0, - type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING, - double param_smoothing1_ = -1.0, - double param_smoothing2_ = -1.0) - : select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_), - param_smoothing2(param_smoothing2_) - { - if (param_smoothing1 < 0) param_smoothing1 = 0.7; - if (param_smoothing2 < 0) { - if (select_smoothing == TWO_STAGE_SMOOTHING) - param_smoothing2 = 2000.0; - else - param_smoothing2 = 0.05; - } + explicit LMJMWeight(double lambda = 0.0) : param_lambda(lambda) { + need_stat(QUERY_LENGTH); need_stat(DOC_LENGTH); - need_stat(RSET_SIZE); - need_stat(TERMFREQ); - need_stat(RELTERMFREQ); - need_stat(DOC_LENGTH_MAX); need_stat(WDF); need_stat(WDF_MAX); need_stat(COLLECTION_FREQ); need_stat(TOTAL_LENGTH); - if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) - need_stat(UNIQUE_TERMS); - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) - need_stat(DOC_LENGTH_MIN); + need_stat(DOC_LENGTH_MIN); } + double get_sumpart(Xapian::termcount wdf, + Xapian::termcount doclen, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + + double get_maxpart() const; + std::string name() const; std::string short_name() const; std::string serialise() const; - LMWeight * unserialise(const std::string & serialised) const; + LMJMWeight* unserialise(const std::string& serialised) const; + + LMJMWeight* create_from_parameters(const char* params) const; +}; + +/** Language Model weighting with Dirichlet or Dir+ smoothing. + * + * Dirichlet smoothing is as described in: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + * + * Dir+ is described in: + * + * Lv, Y., & Zhai, C. (2011). Lower-bounding term frequency normalization. + * International Conference on Information and Knowledge Management. + */ +class XAPIAN_VISIBILITY_DEFAULT LMDirichletWeight : public Weight { + /// The factor to multiply weights by. + double factor; + + /// Parameter controlling the smoothing. + double param_mu; + + /// A pseudo TF value to control the scale of the TF lower bound. + double param_delta; + + /// Precalculated multiplier for use in weight calculations. + double multiplier; + + /** Precalculated offset to add to every sumextra. + * + * This is needed because the formula can return a negative + * term-independent weight. + */ + double extra_offset; + + LMDirichletWeight* clone() const; + + void init(double factor_); + + public: + /** Construct a LMDirichletWeight. + * + * @param mu A parameter which is > 0. Default: 2000 + * @param delta A parameter which is >= 0, which is "a pseudo [wdf] + * value to control the scale of the [wdf] lower bound". + * If this parameter is > 0, then the smoothing is Dir+; + * if it's zero, it's Dirichlet. 
Default: 0.05 + */ + explicit LMDirichletWeight(double mu = 2000.0, double delta = 0.05) + : param_mu(mu), param_delta(delta) { + need_stat(QUERY_LENGTH); + need_stat(DOC_LENGTH); + need_stat(WDF); + need_stat(WDF_MAX); + need_stat(COLLECTION_FREQ); + need_stat(TOTAL_LENGTH); + need_stat(DOC_LENGTH_MIN); + need_stat(DOC_LENGTH_MAX); + } double get_sumpart(Xapian::termcount wdf, Xapian::termcount doclen, Xapian::termcount uniqterm, Xapian::termcount wdfdocmax) const; + + double get_maxpart() const; + + double get_sumextra(Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const; + + double get_maxextra() const; + + std::string name() const; + std::string short_name() const; + + std::string serialise() const; + LMDirichletWeight* unserialise(const std::string& serialised) const; + + LMDirichletWeight* create_from_parameters(const char* params) const; +}; + +/** Language Model weighting with Absolute Discount smoothing. + * + * As described in: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + */ +class XAPIAN_VISIBILITY_DEFAULT LMAbsDiscountWeight : public Weight { + /// The factor to multiply weights by. + double factor; + + /// Parameter controlling the smoothing. + double param_delta; + + /// Precalculated multiplier for use in weight calculations. + double multiplier; + + /** Precalculated offset to add to every sumextra. + * + * This is needed because the formula can return a negative + * term-independent weight. + */ + double extra_offset; + + LMAbsDiscountWeight* clone() const; + + void init(double factor_); + + public: + /** Construct a LMAbsDiscountWeight. + * + * @param delta A parameter between 0 and 1. Default: 0.7 + */ + explicit LMAbsDiscountWeight(double delta = 0.7) : param_delta(delta) { + need_stat(QUERY_LENGTH); + need_stat(DOC_LENGTH); + need_stat(WDF); + need_stat(WDF_MAX); + need_stat(COLLECTION_FREQ); + need_stat(TOTAL_LENGTH); + need_stat(DOC_LENGTH_MIN); + need_stat(UNIQUE_TERMS); + need_stat(DOC_LENGTH_MAX); + } + + double get_sumpart(Xapian::termcount wdf, + Xapian::termcount, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + double get_maxpart() const; double get_sumextra(Xapian::termcount doclen, Xapian::termcount, Xapian::termcount) const; + double get_maxextra() const; - LMWeight * create_from_parameters(const char * params) const; + std::string name() const; + std::string short_name() const; + + std::string serialise() const; + LMAbsDiscountWeight* unserialise(const std::string& serialised) const; + + LMAbsDiscountWeight* create_from_parameters(const char* params) const; +}; + +/** Language Model weighting with Two Stage smoothing. + * + * As described in: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + */ +class XAPIAN_VISIBILITY_DEFAULT LM2StageWeight : public Weight { + /// The factor to multiply weights by. + double factor; + + /// Parameter controlling the smoothing. + double param_lambda; + + /// Parameter controlling the smoothing. + double param_mu; + + /// Precalculated multiplier for use in weight calculations. + double multiplier; + + /** Precalculated offset to add to every sumextra. + * + * This is needed because the formula can return a negative + * term-independent weight. 
+ */ + double extra_offset; + + LM2StageWeight* clone() const; + + void init(double factor_); + + public: + /** Construct a LM2StageWeight. + * + * @param lambda A parameter between 0 and 1 which linearly interpolates + * between the maximum likelihood model (at 0) and the + * collection model (at 1). Default: 0.7 + * @param mu A parameter which is greater than 0. Default: 2000 + */ + explicit LM2StageWeight(double lambda = 0.7, double mu = 2000.0) + : param_lambda(lambda), param_mu(mu) + { + need_stat(QUERY_LENGTH); + need_stat(DOC_LENGTH); + need_stat(WDF); + need_stat(WDF_MAX); + need_stat(COLLECTION_FREQ); + need_stat(TOTAL_LENGTH); + need_stat(DOC_LENGTH_MIN); + need_stat(DOC_LENGTH_MAX); + } + + double get_sumpart(Xapian::termcount wdf, + Xapian::termcount doclen, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + double get_maxpart() const; + + double get_sumextra(Xapian::termcount doclen, + Xapian::termcount uniqterm, + Xapian::termcount wdfdocmax) const; + double get_maxextra() const; + + std::string name() const; + std::string short_name() const; + + std::string serialise() const; + LM2StageWeight* unserialise(const std::string& serialised) const; + + LM2StageWeight* create_from_parameters(const char* params) const; }; /** Xapian::Weight subclass implementing Coordinate Matching. diff --git a/xapian-core/tests/api_nodb.cc b/xapian-core/tests/api_nodb.cc index 0863d29d2..70609d6e2 100644 --- a/xapian-core/tests/api_nodb.cc +++ b/xapian-core/tests/api_nodb.cc @@ -3,7 +3,7 @@ */ /* Copyright 1999,2000,2001 BrightStation PLC * Copyright 2002 Ananova Ltd - * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2015,2016,2017,2019 Olly Betts + * Copyright 2002-2024 Olly Betts * Copyright 2006 Lemur Consulting Ltd * * This program is free software; you can redistribute it and/or diff --git a/xapian-core/tests/api_weight.cc b/xapian-core/tests/api_weight.cc index d59fe0451..1f772691d 100644 --- a/xapian-core/tests/api_weight.cc +++ b/xapian-core/tests/api_weight.cc @@ -135,9 +135,14 @@ DEFINE_TESTCASE(weightserialisation1, !backend) { TEST_WEIGHT_CLASS(Xapian::PL2PlusWeight, (1.0, 0.8), (2.0, 0.9)); - TEST_WEIGHT_CLASS(Xapian::LMWeight, - (0.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0.7, 2000.0), - (0.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0.7)); + TEST_WEIGHT_CLASS(Xapian::LM2StageWeight, + (0.7, 2000.0), + (0.5, 2000.0)); + TEST_WEIGHT_CLASS(Xapian::LMAbsDiscountWeight, (0.7), (0.75)); + TEST_WEIGHT_CLASS(Xapian::LMDirichletWeight, + (2000.0, 0.05), + (2034.0, 0.0)); + TEST_WEIGHT_CLASS(Xapian::LMJMWeight, (0.0), (0.5)); } /// Basic test of using weighting schemes. @@ -176,9 +181,26 @@ DEFINE_TESTCASE(weight1, backend) { } Xapian::MSet mset_scaled = enquire_scaled.get_mset(0, expected_matches); TEST_EQUAL(mset_scaled.size(), expected_matches); + auto lm = name.find("::LM"); + // All the LM* schemes have sumextra except LMJMWeight. + // + // BM25 and BM25+ have sumextra, but by default k2 is 0 which means + // sumextra is zero too. + bool has_sumextra = lm != string::npos && name[lm + 4] != 'J'; for (Xapian::doccount i = 0; i < expected_matches; ++i) { - TEST_EQUAL_DOUBLE(mset_scaled[i].get_weight(), - mset[i].get_weight() * 15.0); + double w = mset[i].get_weight(); + double ws = mset_scaled[i].get_weight(); + if (has_sumextra) { + // sumextra is not scaled, so we can't test for (near) + // equality, but we can test that the weight is affected by the + // scaling, and that it's between the unscaled weight and the + // fully scaled weight. 
+ TEST_NOT_EQUAL_DOUBLE(ws, w); + TEST_REL(ws, <=, w * 15.0); + TEST_REL(ws, >=, w); + } else { + TEST_EQUAL_DOUBLE(ws, w * 15.0); + } } }; @@ -201,7 +223,10 @@ DEFINE_TESTCASE(weight1, backend) { TEST_WEIGHTING_SCHEME(Xapian::BB2Weight); TEST_WEIGHTING_SCHEME(Xapian::PL2Weight); TEST_WEIGHTING_SCHEME(Xapian::PL2PlusWeight); - TEST_WEIGHTING_SCHEME(Xapian::LMWeight); + TEST_WEIGHTING_SCHEME(Xapian::LM2StageWeight); + TEST_WEIGHTING_SCHEME(Xapian::LMAbsDiscountWeight); + TEST_WEIGHTING_SCHEME(Xapian::LMDirichletWeight); + TEST_WEIGHTING_SCHEME(Xapian::LMJMWeight); // Regression test for bug fixed in 1.2.4. TEST_WEIGHTING_SCHEME(Xapian::BM25Weight, 0, 0, 0, 0, 1); /* As mentioned in the documentation, when parameter k is 0, wdf and @@ -1640,78 +1665,18 @@ DEFINE_TESTCASE(checkstatsweight5, backend && !multi && !remote) { Xapian::MSet mset3 = enquire.get_mset(0, db.get_doccount()); } -// Two stage should perform same as Jelinek mercer if smoothing parameter for mercer is kept 1 in both. -DEFINE_TESTCASE(unigramlmweight4, backend) { +// Feature test for Dir+ weighting. +DEFINE_TESTCASE(lmdirichletweight1, backend) { Xapian::Database db = get_database("apitest_simpledata"); Xapian::Enquire enquire1(db); Xapian::Enquire enquire2(db); enquire1.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset1; enquire2.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset2; - // 5 documents available with term paragraph so mset size should be 5 - enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::TWO_STAGE_SMOOTHING, 1, 0)); - enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 1, 0)); - mset1 = enquire1.get_mset(0, 10); - mset2 = enquire2.get_mset(0, 10); - - TEST_EQUAL(mset1.size(), 5); - TEST_EQUAL_DOUBLE(mset1[1].get_weight(), mset2[1].get_weight()); -} - -/* Test for checking if we don't use smoothing all - * of them should give same result i.e wdf_double/len_double */ -DEFINE_TESTCASE(unigramlmweight5, backend) { - Xapian::Database db = get_database("apitest_simpledata"); - Xapian::Enquire enquire1(db); - Xapian::Enquire enquire2(db); - Xapian::Enquire enquire3(db); - Xapian::Enquire enquire4(db); - enquire1.set_query(Xapian::Query("paragraph")); Xapian::MSet mset1; - enquire2.set_query(Xapian::Query("paragraph")); Xapian::MSet mset2; - enquire3.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset3; - enquire4.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset4; - // 5 documents available with term paragraph so mset size should be 5 - enquire1.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::TWO_STAGE_SMOOTHING, 0, 0)); - enquire2.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::JELINEK_MERCER_SMOOTHING, 0, 0)); - enquire3.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING, 0, 0)); - enquire4.set_weighting_scheme(Xapian::LMWeight(10000.0, Xapian::Weight::DIRICHLET_SMOOTHING, 0, 0)); - - mset1 = enquire1.get_mset(0, 10); - mset2 = enquire2.get_mset(0, 10); - mset3 = enquire3.get_mset(0, 10); - mset4 = enquire4.get_mset(0, 10); - TEST_EQUAL(mset1.size(), 5); - TEST_EQUAL(mset2.size(), 5); - TEST_EQUAL(mset3.size(), 5); - TEST_EQUAL(mset4.size(), 5); - for (Xapian::doccount i = 0; i < 5; ++i) { - TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset4[i].get_weight()); - TEST_EQUAL_DOUBLE(mset2[i].get_weight(), mset4[i].get_weight()); - TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset2[i].get_weight()); - TEST_EQUAL_DOUBLE(mset3[i].get_weight(), mset2[i].get_weight()); - 
TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset4[i].get_weight()); - TEST_EQUAL_DOUBLE(mset1[i].get_weight(), mset3[i].get_weight()); - } -} - -// Feature test for Dir+ function. -DEFINE_TESTCASE(unigramlmweight7, backend) { - Xapian::Database db = get_database("apitest_simpledata"); - Xapian::Enquire enquire1(db); - Xapian::Enquire enquire2(db); - enquire1.set_query(Xapian::Query("paragraph")); - enquire2.set_query(Xapian::Query("paragraph")); - Xapian::MSet mset1; - Xapian::MSet mset2; - - enquire1.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0)); - enquire2.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_PLUS_SMOOTHING, 2000, 0.05)); + enquire1.set_weighting_scheme(Xapian::LMDirichletWeight(2000, 0)); + enquire2.set_weighting_scheme(Xapian::LMDirichletWeight(2000, 0.05)); mset1 = enquire1.get_mset(0, 10); mset2 = enquire2.get_mset(0, 10); @@ -1720,38 +1685,13 @@ DEFINE_TESTCASE(unigramlmweight7, backend) { TEST_EQUAL(mset1.size(), 5); TEST_EQUAL(mset2.size(), 5); - // Expect mset weights associated with Dir+ more than mset weights by Dir - // because of the presence of extra weight component in Dir+ function. - TEST_REL(mset2[0].get_weight(),>,mset1[0].get_weight()); - TEST_REL(mset2[1].get_weight(),>,mset1[1].get_weight()); - TEST_REL(mset2[2].get_weight(),>,mset1[2].get_weight()); - TEST_REL(mset2[3].get_weight(),>,mset1[3].get_weight()); - TEST_REL(mset2[4].get_weight(),>,mset1[4].get_weight()); -} - -// Regression test that OP_SCALE_WEIGHT works with LMWeight (fixed in 1.4.1). -DEFINE_TESTCASE(unigramlmweight8, backend) { - Xapian::Database db = get_database("apitest_simpledata"); - Xapian::Enquire enquire(db); - Xapian::Query query("paragraph"); - - enquire.set_query(query); - enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0)); - - Xapian::MSet mset1; - mset1 = enquire.get_mset(0, 10); - TEST_EQUAL(mset1.size(), 5); - - enquire.set_query(Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 15.0)); - enquire.set_weighting_scheme(Xapian::LMWeight(0, Xapian::Weight::DIRICHLET_SMOOTHING, 2000, 0)); - - Xapian::MSet mset2; - mset2 = enquire.get_mset(0, 10); - TEST_EQUAL(mset2.size(), mset1.size()); - TEST_NOT_EQUAL_DOUBLE(mset1[0].get_weight(), 0.0); - for (Xapian::doccount i = 0; i < mset1.size(); ++i) { - TEST_EQUAL_DOUBLE(15.0 * mset1[i].get_weight(), mset2[i].get_weight()); - } + // Expect mset weights from Dir+ to be less than mset weights from + // Dirichlet for this testcase. + TEST_REL(mset2[0].get_weight(),<,mset1[0].get_weight()); + TEST_REL(mset2[1].get_weight(),<,mset1[1].get_weight()); + TEST_REL(mset2[2].get_weight(),<,mset1[2].get_weight()); + TEST_REL(mset2[3].get_weight(),<,mset1[3].get_weight()); + TEST_REL(mset2[4].get_weight(),<,mset1[4].get_weight()); } // Feature test for CoordWeight. diff --git a/xapian-core/weight/lmweight.cc b/xapian-core/weight/lmweight.cc dissimilarity index 86% index 44b258432..3648b244d 100644 --- a/xapian-core/weight/lmweight.cc +++ b/xapian-core/weight/lmweight.cc @@ -1,316 +1,583 @@ -/** @file - * @brief Xapian::LMWeight class - the Unigram Language Modelling formula. - */ -/* Copyright (C) 2012 Gaurav Arora - * Copyright (C) 2016,2019 Olly Betts - * Copyright (C) 2016 Vivek Pal - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "xapian/weight.h" -#include "weightinternal.h" - -#include "debuglog.h" -#include "omassert.h" -#include "serialise-double.h" -#include "stringutils.h" - -#include "xapian/error.h" - -#include - -using namespace std; - -namespace Xapian { - -LMWeight * -LMWeight::clone() const { - return new LMWeight(param_log, select_smoothing, param_smoothing1, param_smoothing2); -} - -void -LMWeight::init(double factor_) -{ - factor = factor_; - - // Storing collection frequency of current term in collection_freq to be - // accessed while smoothing of weights for the term, for term not present - // in the document. - double collection_freq = get_collection_freq(); - - // Collection_freq of a term in collection should be always greater than or - // equal to zero (Non Negative). - AssertRel(collection_freq,>=,0); - LOGVALUE(WTCALC, collection_freq); - - // This is the total number of term occurrences in the collection, which we - // use for smoothing. - Xapian::totallength total_length = get_total_length(); - - /* In case the within document frequency of term is zero smoothing will - * be required and should be return instead of returning zero, as returning - * LM score are multiplication of contribution of all terms, due to absence - * of single term whole document is scored zero, hence apply collection - * frequency smoothing. - */ - if (rare(total_length == 0)) { - AssertEq(collection_freq, 0); - weight_collection = 0; - } else { - weight_collection = double(collection_freq) / total_length; - } - - // There can't be more relevant term in collection than total number of - // term. - AssertRel(weight_collection,<=,1.0); - - /* Setting default values of the param_log to handle negative value of log. - * It is considered to be upperbound of document length. - * initializing param_log to upperbound of document_length. - */ - - if (param_log == 0.0) { - param_log = get_doclength_upper_bound(); - } - - /* Since the optimal parameter for Jelinek mercer smoothing - * is based on query length, so if query is title query changing - * default value of smoothing parameter. - */ - - if (select_smoothing == JELINEK_MERCER_SMOOTHING || - select_smoothing == TWO_STAGE_SMOOTHING) { - if (param_smoothing1 == 0.7) { - if (get_query_length() <= 2) { - param_smoothing1 = 0.1; - } - } - } - - /* param_smoothing1 default value should be 2000 in case - * DIRICHLET_SMOOTHING is selected. 
Tweaking param_smoothing1 - * if user supply his own value for param_smoothing1 value will not be set - * to 2000(default value) - */ - if (select_smoothing == DIRICHLET_SMOOTHING) { - if (param_smoothing1 == 0.7) { - param_smoothing1 = 2000; - } - } - - /* Setting param_smoothing1 and param_smoothing2 default value to used when - * DIRICHLET_PLUS_SMOOTHING is selected.*/ - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - if (param_smoothing1 == 0.7) { - param_smoothing1 = 2000; - } - } -} - -string -LMWeight::name() const -{ - return "Xapian::LMWeight"; -} - -string -LMWeight::short_name() const -{ - return "lm"; -} - -string -LMWeight::serialise() const -{ - string result = serialise_double(param_log); - result += static_cast(select_smoothing); - result += serialise_double(param_smoothing1); - result += serialise_double(param_smoothing2); - return result; -} - -LMWeight * -LMWeight::unserialise(const string & s) const -{ - const char *ptr = s.data(); - const char *end = ptr + s.size(); - double param_log_ = unserialise_double(&ptr, end); - type_smoothing select_smoothing_ = static_cast(*(ptr)++); - double param_smoothing1_ = unserialise_double(&ptr, end); - double param_smoothing2_ = unserialise_double(&ptr, end); - if (rare(ptr != end)) - throw Xapian::SerialisationError("Extra data in LMWeight::unserialise()"); - return new LMWeight(param_log_, select_smoothing_, param_smoothing1_, param_smoothing2_); -} - -double -LMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len, - Xapian::termcount uniqterm, Xapian::termcount) const -{ - // Within Document Frequency of the term in document being considered. - double wdf_double = wdf; - // Length of the Document in terms of number of terms. - double len_double = len; - // variable to store weight contribution of term in the document scoring for LM. - double weight_sum; - - // Calculating weights considering different smoothing option available to user. - if (select_smoothing == JELINEK_MERCER_SMOOTHING) { - /* Maximum likelihood of current term, weight contribution of term in - * case query term is present in the document. - */ - double weight_document = wdf_double / len_double; - weight_sum = (param_smoothing1 * weight_collection) + - ((1 - param_smoothing1) * weight_document); - } else if (select_smoothing == DIRICHLET_SMOOTHING) { - weight_sum = (wdf_double + (param_smoothing1 * weight_collection)) / - (len_double + param_smoothing1); - } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - /* In the Dir+ weighting formula, sumpart weight contribution is :- - * - * sum of log of (1 + (wdf/(param_smoothing1 * weight_collection))) and - * log of (1 + (delta/param_smoothing1 * weight_collection))). - * Since, sum of logs is log of product so weight_sum is calculated as product - * of terms in log in the Dir+ formula. - */ - weight_sum = (1 + (wdf_double / (param_smoothing1 * weight_collection))) * - (1 + (param_smoothing2 / (param_smoothing1 * weight_collection))); - } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) { - double uniqterm_double = uniqterm; - weight_sum = ((((wdf_double - param_smoothing1) > 0) ? 
(wdf_double - param_smoothing1) : 0) / len_double) + ((param_smoothing1 * weight_collection * uniqterm_double) / len_double); - } else { - weight_sum = (((1 - param_smoothing1) * (wdf_double + (param_smoothing2 * weight_collection)) / (len_double + param_smoothing2)) + (param_smoothing1 * weight_collection)); - } - - /* Since LM score is calculated with multiplication, instead of changing - * the current implementation log trick have been used to calculate the - * product since (sum of log is log of product and since aim is ranking - * ranking document by product or log of product won't make a large - * difference hence log(product) will be used for ranking. - */ - double product = weight_sum * param_log; - return (product > 1.0) ? factor * log(product) : 0; -} - -double -LMWeight::get_maxpart() const -{ - // Variable to store the collection frequency - double upper_bound; - // Store upper bound on wdf in variable wdf_max - double wdf_max = get_wdf_upper_bound(); - - // Calculating upper bound considering different smoothing option available to user. - if (select_smoothing == JELINEK_MERCER_SMOOTHING) { - upper_bound = (param_smoothing1 * weight_collection) + (1 - param_smoothing1); - } else if (select_smoothing == DIRICHLET_SMOOTHING) { - upper_bound = (get_doclength_upper_bound() + (param_smoothing1 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing1); - } else if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - upper_bound = (1 + (wdf_max / (param_smoothing1 * weight_collection))) * - (1 + (param_smoothing2 / (param_smoothing1 * weight_collection))); - } else if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING) { - upper_bound = param_smoothing1 * weight_collection + 1; - } else { - upper_bound = (((1 - param_smoothing1) * (get_doclength_upper_bound() + (param_smoothing2 * weight_collection)) / (get_doclength_upper_bound() + param_smoothing2)) + (param_smoothing1 * weight_collection)); - } - - /* Since weight are calculated using log trick, using same with the bounds. Refer - * comment in get_sumpart for the details. - */ - double product = upper_bound * param_log; - return (product > 1.0) ? factor * log(product) : 1.0; -} - -/* The extra weight component in the Dir+ formula is :- - * - * |Q| * log (param_smoothing1 / (|D| + param_smoothing1)) - * - * where, |Q| is total query length. - * |D| is total document length. 
- */ -double -LMWeight::get_sumextra(Xapian::termcount len, - Xapian::termcount, - Xapian::termcount) const -{ - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - double extra_weight = param_smoothing1 / (len + param_smoothing1); - return get_query_length() * log(extra_weight); - } - return 0; -} - -double -LMWeight::get_maxextra() const -{ - if (select_smoothing == DIRICHLET_PLUS_SMOOTHING) { - double extra_weight = param_smoothing1 / (get_doclength_lower_bound() + param_smoothing1); - return get_query_length() * log(extra_weight); - } - return 0; -} - -static bool -type_smoothing_param(const char ** p, Xapian::Weight::type_smoothing * ptr_val) -{ - const char* q = *p; - char ch = *q++; - if (ch < '1' || ch > '5' || C_isdigit(*q)) { - return false; - } - *p = q; - static const Xapian::Weight::type_smoothing smooth_tab[5] = { - Xapian::Weight::TWO_STAGE_SMOOTHING, - Xapian::Weight::DIRICHLET_SMOOTHING, - Xapian::Weight::ABSOLUTE_DISCOUNT_SMOOTHING, - Xapian::Weight::JELINEK_MERCER_SMOOTHING, - Xapian::Weight::DIRICHLET_PLUS_SMOOTHING - }; - *ptr_val = smooth_tab[ch - '1']; - return true; -} - -static inline void -parameter_error(const char* message) -{ - Xapian::Weight::Internal::parameter_error(message, "lm"); -} - -LMWeight * -LMWeight::create_from_parameters(const char * p) const -{ - if (*p == '\0') - return new Xapian::LMWeight(); - double param_log_ = 0; - Xapian::Weight::type_smoothing type = Xapian::Weight::TWO_STAGE_SMOOTHING; - double smoothing1 = 0.7; - double smoothing2 = 2000; - if (!Xapian::Weight::Internal::double_param(&p, ¶m_log_)) - parameter_error("Parameter 1 (log) is invalid"); - if (*p && !type_smoothing_param(&p, &type)) - parameter_error("Parameter 2 (smoothing_type) is invalid"); - if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing1)) - parameter_error("Parameter 3 (smoothing1) is invalid"); - if (*p && !Xapian::Weight::Internal::double_param(&p, &smoothing2)) - parameter_error("Parameter 4 (smoothing2) is invalid"); - if (*p) - parameter_error("Extra data after parameter 4"); - return new Xapian::LMWeight(param_log_, type, smoothing1, smoothing2); -} - -} +/** @file + * @brief Unigram Language Modelling weighting classes + */ +/* Copyright (C) 2012 Gaurav Arora + * Copyright (C) 2016,2019,2024 Olly Betts + * Copyright (C) 2016 Vivek Pal + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "xapian/weight.h" +#include "weightinternal.h" + +#include "debuglog.h" +#include "omassert.h" +#include "serialise-double.h" +#include "stringutils.h" + +#include "xapian/error.h" + +#include + +using namespace std; + +[[noreturn]] +static inline void +parameter_error(const char* message, const std::string& scheme) +{ + Xapian::Weight::Internal::parameter_error(message, scheme); +} + +/* The equations for Jelinek-Mercer, Dirichlet, and Absolute Discount smoothing + * are taken from: + * + * Zhai, C., & Lafferty, J.D. (2004). A study of smoothing methods for language + * models applied to information retrieval. ACM Trans. Inf. Syst., 22, 179-214. + * + * The equations for Two-Stage smoothing are also from this paper, but aren't + * given explicitly there, so we have derived them using the approach + * described in the paper. + * + * Dir+ comes from: + * + * Lv, Y., & Zhai, C. (2011). Lower-bounding term frequency normalization. + * International Conference on Information and Knowledge Management. + */ + +namespace Xapian { + +void +LMJMWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + double lambda = param_lambda; + if (lambda <= 0.0 || lambda >= 1.0) { + auto query_len = get_query_length(); + if (query_len <= 2) { + lambda = 0.1; + } else if (query_len < 8) { + lambda = (query_len - 1) * 0.1; + } else { + lambda = 0.7; + } + } + + // Pre-calculate multiplier. + auto collection_freq = get_collection_freq(); + multiplier = (1.0 - lambda) * total_length / (lambda * collection_freq); +} + +double +LMJMWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len, + Xapian::termcount, Xapian::termcount) const +{ + double w = multiplier * wdf / len; + return factor * log(1.0 + w); +} + +double +LMJMWeight::get_maxpart() const +{ + Xapian::termcount wdf_max = get_wdf_upper_bound(); + Xapian::termcount len_min = get_doclength_lower_bound(); + double w = multiplier; + if (wdf_max < len_min) { + // Clearly wdf / len <= wdf_max / len_min, but we also know that + // wdf <= len so wdf / len <= 1 so we use whichever bound is tighter. 
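+ // (A worked instance with made-up numbers: if wdf_max = 40 and + // len_min = 100 then wdf / len <= 0.4, which is tighter than 1; but if + // wdf_max = 200 and len_min = 100 the best bound available is + // wdf / len <= 1, and we leave w alone.)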
+ w *= double(wdf_max) / len_min; + } + return factor * log(1.0 + w); +} + +LMJMWeight* +LMJMWeight::clone() const { + return new LMJMWeight(param_lambda); +} + +string +LMJMWeight::name() const +{ + return "Xapian::LMJMWeight"; +} + +string +LMJMWeight::short_name() const +{ + return "lmjm"; +} + +string +LMJMWeight::serialise() const +{ + return serialise_double(param_lambda); +} + +LMJMWeight* +LMJMWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double lambda = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LMJMWeight::unserialise()"); + return new LMJMWeight(lambda); +} + + +LMJMWeight* +LMJMWeight::create_from_parameters(const char* p) const +{ + double lambda = 0.0; + if (*p && !Xapian::Weight::Internal::double_param(&p, &lambda)) + parameter_error("Parameter lambda is invalid", "lmjm"); + if (*p) + parameter_error("Extra data after parameter", "lmjm"); + return new Xapian::LMJMWeight(lambda); +} + +void +LMDirichletWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + double mu = param_mu; + + auto doclen_max = get_doclength_upper_bound(); + extra_offset = get_query_length() * log(doclen_max + mu); + if (factor == 0.0) { + // This object is for the term-independent contribution. + return; + } + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + // Pre-calculate the factor to multiply by. + multiplier = total_length / (get_collection_freq() * mu); + + double delta = param_delta; + if (delta != 0.0) { + // Include the document-independent Dir+ extra contribution in factor. + factor *= log(1.0 + delta * multiplier); + } +} + +double +LMDirichletWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount, + Xapian::termcount, Xapian::termcount) const +{ + return factor * log(1.0 + wdf * multiplier); +} + +double +LMDirichletWeight::get_maxpart() const +{ + Xapian::termcount wdf_max = get_wdf_upper_bound(); + return factor * log(1.0 + wdf_max * multiplier); +} + +double +LMDirichletWeight::get_sumextra(Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const +{ + // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent + // constant" which is: + // + // query_len * log(mu / (doclen + mu)) + // + // (The equation for Dir+ is the same.) + // + // Both mu and doclen are positive, so this value will be negative but + // get_sumextra() must return a non-negative value. To solve this we + // decompose this into: + // + // query_len * log(mu) - query_len * log(doclen + mu) + // + // The first part is constant for a given query so we ignore it. The second + // part is still negative, but we can add query_len * log(doclen_max + mu) + // (also constant for a given query) to address that, giving: + // + // query_len * log(doclen_max + mu) - query_len * log(doclen + mu) + // + // We pre-calculate the first part in init() and save it in member variable + // extra_offset.
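+ // (A worked instance with made-up numbers: with mu = 2000 and + // doclen_max = 500, extra_offset is query_len * log(2500), so a document + // of length 100 gets sumextra of query_len * (log(2500) - log(2100)), + // which is small but non-negative as required.)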
+ return extra_offset - get_query_length() * log(doclen + param_mu); +} + +double +LMDirichletWeight::get_maxextra() const +{ + auto doclen_min = get_doclength_lower_bound(); + return extra_offset - get_query_length() * log(doclen_min + param_mu); +} + +LMDirichletWeight* +LMDirichletWeight::clone() const { + return new LMDirichletWeight(param_mu, param_delta); +} + +string +LMDirichletWeight::name() const +{ + return "Xapian::LMDirichletWeight"; +} + +string +LMDirichletWeight::short_name() const +{ + return "lmdirichlet"; +} + +string +LMDirichletWeight::serialise() const +{ + string result = serialise_double(param_mu); + result += serialise_double(param_delta); + return result; +} + +LMDirichletWeight* +LMDirichletWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double mu = unserialise_double(&ptr, end); + double delta = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LMDirichletWeight::unserialise()"); + return new LMDirichletWeight(mu, delta); +} + + +LMDirichletWeight* +LMDirichletWeight::create_from_parameters(const char* p) const +{ + double mu = 2000.0; + double delta = 0.05; + if (*p && !Xapian::Weight::Internal::double_param(&p, &mu)) + parameter_error("Parameter mu is invalid", "lmdirichlet"); + if (*p && !Xapian::Weight::Internal::double_param(&p, &delta)) + parameter_error("Parameter delta is invalid", "lmdirichlet"); + if (*p) + parameter_error("Extra data after parameters", "lmdirichlet"); + return new Xapian::LMDirichletWeight(mu, delta); +} + +void +LMAbsDiscountWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + auto doclen_max = get_doclength_upper_bound(); + extra_offset = get_query_length() * log(double(doclen_max)); + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + // Pre-calculate the factor to multiply by. + multiplier = total_length / (param_delta * get_collection_freq()); +} + +double +LMAbsDiscountWeight::get_sumpart(Xapian::termcount wdf, + Xapian::termcount, + Xapian::termcount uniqterms, + Xapian::termcount) const +{ + return factor * log(1.0 + (wdf - param_delta) / uniqterms * multiplier); +} + +double +LMAbsDiscountWeight::get_maxpart() const +{ + Xapian::termcount doclen_min = get_doclength_lower_bound(); + Xapian::termcount wdf_max = get_wdf_upper_bound(); + double x = (wdf_max - param_delta) * multiplier; + // We need a lower bound on uniqterms. We have doclen = sum(wdf) so: + // + // uniqterms >= ceil(doclen_min / wdf_max) + // + // We also know uniqterms >= 1 (since documents without terms won't match). + if (doclen_min > wdf_max) + x /= (doclen_min - 1) / wdf_max + 1; + return factor * log(1.0 + x); +} + +double +LMAbsDiscountWeight::get_sumextra(Xapian::termcount doclen, + Xapian::termcount uniqterms, + Xapian::termcount) const +{ + // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent + // constant" which is: + // + // query_len * log(delta * uniqterms / doclen) + // + // We know delta < 1 and uniqterms <= doclen so this value will be negative + // but get_sumextra() must return a non-negative value. To solve this we + // decompose this into: + // + // query_len * log(delta) + query_len * log(uniqterms / doclen) + // + // The first part is constant for a given query so we ignore it.
The second + // part is still negative, but we can subtract query_len * log(1 / doclen_max) + // (also constant for a given query) to address that, giving: + // + // -query_len * log(1 / doclen_max) + query_len * log(uniqterms / doclen) + // = query_len * log(doclen_max) + query_len * log(uniqterms / doclen) + // + // We pre-calculate the first part in init() and save it in member variable + // extra_offset. + return extra_offset + get_query_length() * log(double(uniqterms) / doclen); +} + +double +LMAbsDiscountWeight::get_maxextra() const +{ + // Our best bound seems to be based on uniqterms <= doclen, which gives + // log(uniqterms / doclen) <= 0. + return extra_offset; +} + +LMAbsDiscountWeight* +LMAbsDiscountWeight::clone() const { + return new LMAbsDiscountWeight(param_delta); +} + +string +LMAbsDiscountWeight::name() const +{ + return "Xapian::LMAbsDiscountWeight"; +} + +string +LMAbsDiscountWeight::short_name() const +{ + return "lmabsdiscount"; +} + +string +LMAbsDiscountWeight::serialise() const +{ + return serialise_double(param_delta); +} + +LMAbsDiscountWeight* +LMAbsDiscountWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double delta = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LMAbsDiscountWeight::unserialise()"); + return new LMAbsDiscountWeight(delta); +} + + +LMAbsDiscountWeight* +LMAbsDiscountWeight::create_from_parameters(const char* p) const +{ + double delta = 0.7; + if (*p && !Xapian::Weight::Internal::double_param(&p, &delta)) + parameter_error("Parameter delta is invalid", "lmabsdiscount"); + if (*p) + parameter_error("Extra data after parameter", "lmabsdiscount"); + return new Xapian::LMAbsDiscountWeight(delta); +} + +void +LM2StageWeight::init(double factor_) +{ + factor = factor_ * get_wqf(); + + double lambda = param_lambda; + double mu = param_mu; + + auto doclen_max = get_doclength_upper_bound(); + extra_offset = -log((lambda * doclen_max + mu) / (doclen_max + mu)); + extra_offset *= get_query_length(); + + Xapian::totallength total_length = get_total_length(); + if (rare(total_length == 0)) { + // Avoid dividing by zero in the corner case where the database has no + // terms with wdf. + AssertEq(get_collection_freq(), 0); + multiplier = 0; + return; + } + + // Pre-calculate the factor to multiply by. + multiplier = (1 - lambda) * total_length / get_collection_freq(); +} + +double +LM2StageWeight::get_sumpart(Xapian::termcount wdf, + Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const +{ + // Termweight formula is: log{ 1 + (1-λ) c(w;d) / ( (λ|d|+μ) p(w|C) ) } + double lambda = param_lambda; + double mu = param_mu; + return factor * log(1.0 + wdf / (lambda * doclen + mu) * multiplier); +} + +double +LM2StageWeight::get_maxpart() const +{ + double lambda = param_lambda; + double mu = param_mu; + Xapian::termcount doclen_min = get_doclength_lower_bound(); + Xapian::termcount wdf_max = get_wdf_upper_bound(); + // We know wdf <= doclen so if the bounds don't rule out wdf and doclen + // being equal we want to find the wdf value w which maximises + // (w / (lambda * w + mu)), which is just a case of maximising w, i.e. + // wdf_max. Otherwise we evaluate at wdf = wdf_max, doclen = doclen_min.
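+ // (A worked instance with made-up numbers, taking lambda = 0.7 and + // mu = 2000: if doclen_min = 10 and wdf_max = 40 then wdf = doclen = 40 + // is possible, giving x = 40 / (0.7 * 40 + 2000); if instead + // doclen_min = 100 we evaluate at wdf = 40, doclen = 100, giving + // x = 40 / (0.7 * 100 + 2000).)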
+ double x = wdf_max / (lambda * max(doclen_min, wdf_max) + mu); + return factor * log(1.0 + x * multiplier); +} + +double +LM2StageWeight::get_sumextra(Xapian::termcount doclen, + Xapian::termcount, + Xapian::termcount) const +{ + // Formula (6) in Zhai and Lafferty 2004 includes "a document-dependent + // constant" which is: + // + // query_len * log αd + // + // where αd = ( λ|d| + μ ) / ( |d| + μ ) for two-stage smoothing, so: + // + // query_len * log{ (lambda * doclen + mu) / (doclen + mu) } + // + // We know lambda < 1 so this will be negative but get_sumextra() must + // return a non-negative value. To achieve this we subtract: + // + // query_len * log{ (lambda * doclen_max + mu) / (doclen_max + mu) } + // + // We pre-calculate the negation of this and store it in extra_offset. + double lambda = param_lambda; + double mu = param_mu; + return extra_offset + + get_query_length() * log((lambda * doclen + mu) / (doclen + mu)); +} + +double +LM2StageWeight::get_maxextra() const +{ + // Our best bound is at the doclen value which maximises + // + // log((lambda * doclen + mu) / (doclen + mu)) + // + // which means maximising + // + // (lambda * doclen + mu) / (doclen + mu) + // + // Putting d' = doclen + mu we want to maximise + // + // lambda + (1 - lambda) * mu / d' + // + // (1 - lambda) > 0 so this is achieved by minimising d' which means + // minimising doclen. + double lambda = param_lambda; + double mu = param_mu; + auto doclen = get_doclength_lower_bound(); + return extra_offset + + get_query_length() * log((lambda * doclen + mu) / (doclen + mu)); +} + +LM2StageWeight* +LM2StageWeight::clone() const { + return new LM2StageWeight(param_lambda, param_mu); +} + +string +LM2StageWeight::name() const +{ + return "Xapian::LM2StageWeight"; +} + +string +LM2StageWeight::short_name() const +{ + return "lm2stage"; +} + +string +LM2StageWeight::serialise() const +{ + string result = serialise_double(param_lambda); + result += serialise_double(param_mu); + return result; +} + +LM2StageWeight* +LM2StageWeight::unserialise(const string& s) const +{ + const char *ptr = s.data(); + const char *end = ptr + s.size(); + double lambda = unserialise_double(&ptr, end); + double mu = unserialise_double(&ptr, end); + if (rare(ptr != end)) + throw Xapian::SerialisationError("Extra data in " + "LM2StageWeight::unserialise()"); + return new LM2StageWeight(lambda, mu); +} + +LM2StageWeight* +LM2StageWeight::create_from_parameters(const char* p) const +{ + double lambda = 0.7; + double mu = 2000.0; + if (*p && !Xapian::Weight::Internal::double_param(&p, &lambda)) + parameter_error("Parameter lambda is invalid", "lm2stage"); + if (*p && !Xapian::Weight::Internal::double_param(&p, &mu)) + parameter_error("Parameter mu is invalid", "lm2stage"); + if (*p) + parameter_error("Extra data after parameters", "lm2stage"); + return new Xapian::LM2StageWeight(lambda, mu); +} + +} diff --git a/xapian-core/weight/weightinternal.h b/xapian-core/weight/weightinternal.h index b71fb2869..14ff54cae 100644 --- a/xapian-core/weight/weightinternal.h +++ b/xapian-core/weight/weightinternal.h @@ -315,6 +315,7 @@ class Weight::Internal { return true; } + [[noreturn]] static void parameter_error(const char * msg, const std::string & scheme) { std::string m(msg); -- 2.11.4.GIT
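A minimal usage sketch of the replacement classes (illustrative only, not part of the patch; the database path and query term are placeholders, and the parameter values shown are just the documented defaults):

    #include <xapian.h>
    #include <iostream>

    int main() {
        Xapian::Database db("/path/to/db"); // placeholder path
        Xapian::Enquire enquire(db);
        enquire.set_query(Xapian::Query("paragraph"));
        // Dir+ smoothing with the default parameters (mu = 2000,
        // delta = 0.05); passing delta = 0 gives plain Dirichlet.
        // LMJMWeight, LMAbsDiscountWeight and LM2StageWeight are
        // selected the same way.
        enquire.set_weighting_scheme(Xapian::LMDirichletWeight(2000.0, 0.05));
        Xapian::MSet mset = enquire.get_mset(0, 10);
        for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i)
            std::cout << *i << "\t" << i.get_weight() << "\n";
    }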