/** @file
 * @brief Definition of FeatureList::Internal class.
 */
/* Copyright (C) 2012 Parth Gupta
 * Copyright (C) 2016 Ayush Tomar
 * Copyright (C) 2019 Vaibhav Kansagara
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 */
26 #include "xapian-letor/featurelist.h"
27 #include "featurelist_internal.h"
28 #include "feature_internal.h"
37 using namespace Xapian
;
40 FeatureList::Internal::set_data(const Xapian::Query
& letor_query
,
41 const Xapian::Database
& letor_db
,
42 const Xapian::Document
& letor_doc
)
44 set_query(letor_query
);
46 set_database(letor_db
);
49 std::map
<std::string
, Xapian::termcount
>
50 FeatureList::Internal::compute_termfreq() const
52 std::map
<std::string
, Xapian::termcount
> tf
;
54 Xapian::TermIterator docterms
= featurelist_doc
.termlist_begin();
55 for (Xapian::TermIterator qt
= featurelist_query
.get_unique_terms_begin();
56 qt
!= featurelist_query
.get_terms_end(); ++qt
) {
57 docterms
.skip_to(*qt
);
58 if (docterms
!= featurelist_doc
.termlist_end() && *qt
== *docterms
)
59 tf
[*qt
] = docterms
.get_wdf();
64 std::map
<std::string
, double>
65 FeatureList::Internal::compute_inverse_doc_freq() const
67 std::map
<std::string
, double> idf
;
68 Xapian::doccount totaldocs
= featurelist_db
.get_doccount();
70 for (Xapian::TermIterator qt
= featurelist_query
.get_unique_terms_begin();
71 qt
!= featurelist_query
.get_terms_end(); ++qt
) {
72 Xapian::doccount df
= featurelist_db
.get_termfreq(*qt
);
74 idf
[*qt
] = log10((double)totaldocs
/ (double)(1 + df
));
79 std::map
<std::string
, Xapian::termcount
>
80 FeatureList::Internal::compute_doc_length() const
82 std::map
<std::string
, Xapian::termcount
> len
;
84 Xapian::termcount title_len
= 0;
85 Xapian::TermIterator dt
= featurelist_doc
.termlist_begin();
86 // reach the iterator to the start of the title terms i.e. prefix "S"
88 for ( ; dt
!= featurelist_doc
.termlist_end(); ++dt
) {
89 if ((*dt
)[0] != 'S') {
90 // We've reached the end of the S-prefixed terms.
93 title_len
+= dt
.get_wdf();
95 len
["title"] = title_len
;
96 Xapian::termcount whole_len
=
97 featurelist_db
.get_doclength(featurelist_doc
.get_docid());
98 len
["whole"] = whole_len
;
99 len
["body"] = whole_len
- title_len
;
103 std::map
<std::string
, Xapian::termcount
>
104 FeatureList::Internal::compute_collection_length() const
106 std::map
<std::string
, Xapian::termcount
> len
;
108 if (!featurelist_db
.get_metadata("collection_len_title").empty() &&
109 !featurelist_db
.get_metadata("collection_len_body").empty() &&
110 !featurelist_db
.get_metadata("collection_len_whole").empty()) {
112 atol(featurelist_db
.get_metadata("collection_len_title").c_str());
114 atol(featurelist_db
.get_metadata("collection_len_body").c_str());
116 atol(featurelist_db
.get_metadata("collection_len_whole").c_str());
118 Xapian::termcount title_len
= 0;
119 Xapian::TermIterator dt
= featurelist_db
.allterms_begin("S");
120 for ( ; dt
!= featurelist_db
.allterms_end("S"); ++dt
) {
121 // because we don't want the unique terms so we want their
122 // original frequencies and i.e. the total size of the title collection.
123 title_len
+= featurelist_db
.get_collection_freq(*dt
);
125 len
["title"] = title_len
;
126 Xapian::termcount whole_len
= featurelist_db
.get_avlength() *
127 featurelist_db
.get_doccount();
128 len
["whole"] = whole_len
;
129 len
["body"] = whole_len
- title_len
;
134 std::map
<std::string
, Xapian::termcount
>
135 FeatureList::Internal::compute_collection_termfreq() const
137 std::map
<std::string
, Xapian::termcount
> tf
;
139 for (Xapian::TermIterator qt
= featurelist_query
.get_unique_terms_begin();
140 qt
!= featurelist_query
.get_terms_end(); ++qt
) {
141 Xapian::termcount coll_tf
= featurelist_db
.get_collection_freq(*qt
);
149 FeatureList::Internal::populate_feature_internal(Feature::Internal
*
152 if (stats_needed
& TERM_FREQUENCY
) {
153 internal_feature
->set_termfreq(compute_termfreq());
155 if (stats_needed
& INVERSE_DOCUMENT_FREQUENCY
) {
156 internal_feature
->set_inverse_doc_freq(compute_inverse_doc_freq());
158 if (stats_needed
& DOCUMENT_LENGTH
) {
159 internal_feature
->set_doc_length(compute_doc_length());
161 if (stats_needed
& COLLECTION_LENGTH
) {
162 internal_feature
->set_collection_length(compute_collection_length());
164 if (stats_needed
& COLLECTION_TERM_FREQ
) {
165 internal_feature
->set_collection_termfreq(
166 compute_collection_termfreq());