Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-letor / api / featurelist_internal.cc
blob3fd99f841e2d6a4a58e98996cf7338008e797cc3
1 /** @file
2 * @brief Definition of FeatureList::Internal class.
3 */
4 /* Copyright (C) 2012 Parth Gupta
5 * Copyright (C) 2016 Ayush Tomar
6 * Copyright (C) 2019 Vaibhav Kansagara
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
24 #include <config.h>
26 #include "xapian-letor/featurelist.h"
27 #include "featurelist_internal.h"
28 #include "feature_internal.h"
30 #include <cmath>
31 #include <cstdio>
32 #include <cstdlib>
33 #include <cstring>
34 #include "debuglog.h"
36 using namespace std;
37 using namespace Xapian;
39 void
40 FeatureList::Internal::set_data(const Xapian::Query & letor_query,
41 const Xapian::Database & letor_db,
42 const Xapian::Document & letor_doc)
44 set_query(letor_query);
45 set_doc(letor_doc);
46 set_database(letor_db);
49 std::map<std::string, Xapian::termcount>
50 FeatureList::Internal::compute_termfreq() const
52 std::map<std::string, Xapian::termcount> tf;
54 Xapian::TermIterator docterms = featurelist_doc.termlist_begin();
55 for (Xapian::TermIterator qt = featurelist_query.get_unique_terms_begin();
56 qt != featurelist_query.get_terms_end(); ++qt) {
57 docterms.skip_to(*qt);
58 if (docterms != featurelist_doc.termlist_end() && *qt == *docterms)
59 tf[*qt] = docterms.get_wdf();
61 return tf;
64 std::map<std::string, double>
65 FeatureList::Internal::compute_inverse_doc_freq() const
67 std::map<std::string, double> idf;
68 Xapian::doccount totaldocs = featurelist_db.get_doccount();
70 for (Xapian::TermIterator qt = featurelist_query.get_unique_terms_begin();
71 qt != featurelist_query.get_terms_end(); ++qt) {
72 Xapian::doccount df = featurelist_db.get_termfreq(*qt);
73 if (df != 0)
74 idf[*qt] = log10((double)totaldocs / (double)(1 + df));
76 return idf;
79 std::map<std::string, Xapian::termcount>
80 FeatureList::Internal::compute_doc_length() const
82 std::map<std::string, Xapian::termcount> len;
84 Xapian::termcount title_len = 0;
85 Xapian::TermIterator dt = featurelist_doc.termlist_begin();
86 // reach the iterator to the start of the title terms i.e. prefix "S"
87 dt.skip_to("S");
88 for ( ; dt != featurelist_doc.termlist_end(); ++dt) {
89 if ((*dt)[0] != 'S') {
90 // We've reached the end of the S-prefixed terms.
91 break;
93 title_len += dt.get_wdf();
95 len["title"] = title_len;
96 Xapian::termcount whole_len =
97 featurelist_db.get_doclength(featurelist_doc.get_docid());
98 len["whole"] = whole_len;
99 len["body"] = whole_len - title_len;
100 return len;
103 std::map<std::string, Xapian::termcount>
104 FeatureList::Internal::compute_collection_length() const
106 std::map<std::string, Xapian::termcount> len;
108 if (!featurelist_db.get_metadata("collection_len_title").empty() &&
109 !featurelist_db.get_metadata("collection_len_body").empty() &&
110 !featurelist_db.get_metadata("collection_len_whole").empty()) {
111 len["title"] =
112 atol(featurelist_db.get_metadata("collection_len_title").c_str());
113 len["body"] =
114 atol(featurelist_db.get_metadata("collection_len_body").c_str());
115 len["whole"] =
116 atol(featurelist_db.get_metadata("collection_len_whole").c_str());
117 } else {
118 Xapian::termcount title_len = 0;
119 Xapian::TermIterator dt = featurelist_db.allterms_begin("S");
120 for ( ; dt != featurelist_db.allterms_end("S"); ++dt) {
121 // because we don't want the unique terms so we want their
122 // original frequencies and i.e. the total size of the title collection.
123 title_len += featurelist_db.get_collection_freq(*dt);
125 len["title"] = title_len;
126 Xapian::termcount whole_len = featurelist_db.get_avlength() *
127 featurelist_db.get_doccount();
128 len["whole"] = whole_len;
129 len["body"] = whole_len - title_len;
131 return len;
134 std::map<std::string, Xapian::termcount>
135 FeatureList::Internal::compute_collection_termfreq() const
137 std::map<std::string, Xapian::termcount> tf;
139 for (Xapian::TermIterator qt = featurelist_query.get_unique_terms_begin();
140 qt != featurelist_query.get_terms_end(); ++qt) {
141 Xapian::termcount coll_tf = featurelist_db.get_collection_freq(*qt);
142 if (coll_tf != 0)
143 tf[*qt] = coll_tf;
145 return tf;
/** Fill a Feature::Internal object with the statistics it has asked for.
 *
 *  stats_needed is tested as a bitmask: for each flag which is set, the
 *  matching compute_*() method is called and its result is passed to the
 *  corresponding setter on internal_feature.  Statistics whose flag is not
 *  set are not computed.
 *
 *  @param internal_feature  Object which receives the computed statistics.
 */
148 void
149 FeatureList::Internal::populate_feature_internal(Feature::Internal*
150 internal_feature)
// Within-document frequency of each query term.
152 if (stats_needed & TERM_FREQUENCY) {
153 internal_feature->set_termfreq(compute_termfreq());
// Inverse document frequency of each query term.
155 if (stats_needed & INVERSE_DOCUMENT_FREQUENCY) {
156 internal_feature->set_inverse_doc_freq(compute_inverse_doc_freq());
// Title, body and whole lengths of the current document.
158 if (stats_needed & DOCUMENT_LENGTH) {
159 internal_feature->set_doc_length(compute_doc_length());
// Title, body and whole lengths summed over the collection.
161 if (stats_needed & COLLECTION_LENGTH) {
162 internal_feature->set_collection_length(compute_collection_length());
// Collection frequency of each query term.
164 if (stats_needed & COLLECTION_TERM_FREQ) {
165 internal_feature->set_collection_termfreq(
166 compute_collection_termfreq());