Initial import into git.
[galago.git] / cpp / galago / src / BinOrderedBinnedRetrieval.cpp
blob5e2d011591c7497ce2ea7ccf7797e7b8b6e7a253
2 //
3 // BinOrderedBinnedRetrieval
4 //
5 // 5 January 2007 -- tds
6 //
8 #include "BinOrderedBinnedRetrieval.hpp"
9 #include "Logging.hpp"
10 #include <queue>
11 #include <string>
12 #include "Query.hpp"
13 #include "LongAccumulator.hpp"
14 #include "WideAccumulator.hpp"
15 #include "ArrayAccumulators.hpp"
16 #include <math.h>
17 #include <algorithm>
20 // openRead
23 void BinOrderedBinnedRetrieval::openRead( const std::string& indexPath ) {
24 _index.openRead( indexPath );
28 // close
29 //
31 void BinOrderedBinnedRetrieval::close() {
32 _index.close();
36 // _fetchIterators
39 void BinOrderedBinnedRetrieval::_fetchIterators( std::vector<ScoredIterator*>& iterators, const std::vector<QueryTerm>& queryTerms ) {
40 for( int i=0; i<queryTerms.size(); i++ ) {
41 const QueryTerm& term = queryTerms[i];
42 BinOrderedBinnedIterator* iterator = _index.getIterator( term.field, term.text );
43 ScoredIterator* scoredIterator = new ScoredIterator( iterator );
45 if( iterator != 0 && iterator->next() ) {
46 iterators.push_back(scoredIterator);
53 // _processExcessTerms
56 void BinOrderedBinnedRetrieval::_processExcessTerms( std::vector<ScoredIterator*>& iterators, Accumulators& accumulators ) {
57 if( accumulators.maxTerms() < iterators.size() ) {
58 std::sort( iterators.begin(), iterators.end(), ScoredIterator::order() );
60 // process the terms that have the highest scores (and the shortest inverted lists)
61 for( int i = accumulators.maxTerms(); i < iterators.size(); i++ ) {
62 ScoredIterator* iter = iterators[i];
64 do {
65 int score = iter->iterator->bin() * iter->weight;
66 accumulators.processBin( score, iter->iterator );
67 } while( iter->iterator->next() );
69 delete iter;
72 // remove all the terms we've already processed
73 iterators.resize( accumulators.maxTerms() );
78 // _loadQueue
81 void BinOrderedBinnedRetrieval::_loadQueue( siter_queue& queue, std::vector<ScoredIterator*>& iterators, Accumulators& accumulators ) {
82 // set up the bin weights, maxScore for each iterator that remains
83 for( int i=0; i<iterators.size(); i++ ) {
84 ScoredIterator* iter = iterators[i];
86 UINT32 maxScore = iter->iterator->bin() * iter->weight;
87 iter->maxScore = maxScore;
88 iter->index = i;
89 accumulators.setMaxUnseen( i, maxScore );
90 queue.push( iter );
95 // runQuery
96 //
98 std::vector<indri::api::ScoredExtentResult>
99 BinOrderedBinnedRetrieval::runQuery( const std::vector<QueryTerm>& inputTerms,
100 int requested,
101 int threshold ) {
102 std::vector<ScoredIterator*> iterators;
103 siter_queue queue;
104 int maximumAccumulators = 0;
106 // stem/stop query terms
107 std::vector<QueryTerm> queryTerms = _index.processTerms( inputTerms );
109 // fetch iterators for the query terms
110 _fetchIterators( iterators, queryTerms );
111 galago_log_query_terms( iterators.size() );
113 // if we're using Anh/Moffat impact indexes, we need to calculate
114 // extra weights at query time.
115 if( _index.requiresWeighting() )
116 _calculateQueryWeights( _index, iterators, _index.bins() );
117 _assignMaxScore( iterators );
119 // buid an accumulator table that's appropriate for the query length
120 Accumulators* accumulators = 0;
121 if( iterators.size() <= LongAccumulator::maxTerms() ) {
122 accumulators = new ArrayAccumulators<LongAccumulator>( iterators.size() );
123 } else {
124 accumulators = new ArrayAccumulators<WideAccumulator>( iterators.size() );
127 // the standard accumulators table can only handle a few terms, so we have
128 // to process any excess terms first
129 _processExcessTerms( iterators, *accumulators );
130 _loadQueue( queue, iterators, *accumulators );
132 if( threshold >= 0 ) {
133 accumulators->setThresholdScore( threshold );
136 int maxEstimate = 0;
138 // for all remaining terms, we enter the optimized loop
139 while( queue.size() > 0 ) {
140 ScoredIterator* iter = queue.top();
141 queue.pop();
143 // if we can quit now, do it
144 if( accumulators->canIgnoreFuturePostings( requested ) ) {
145 delete iter;
147 while( queue.size() > 0 ) {
148 delete queue.top();
149 queue.pop();
152 break;
155 int score = iter->iterator->bin() * iter->weight;
156 accumulators->processBin( score, iter->index, iter->iterator );
157 bool hasMore = iter->iterator->next();
159 if( hasMore ) {
160 _assignMaxScore( iter );
161 accumulators->setMaxUnseen( iter->index, iter->maxScore );
162 queue.push(iter);
163 } else {
164 accumulators->setMaxUnseen( iter->index, 0 );
165 delete iter;
168 accumulators->calculateThresholdScore( requested );
169 maximumAccumulators = std::max( maximumAccumulators, accumulators->size() );
170 accumulators->trim();
173 std::vector<indri::api::ScoredExtentResult> results = accumulators->topResults( requested );
174 delete accumulators;
175 return results;
179 // getDocument
182 std::string BinOrderedBinnedRetrieval::getDocument( int doc ) {
183 return _index.getDocument(doc);
187 // _calculateQueryWeights
190 void BinOrderedBinnedRetrieval::_calculateQueryWeights( const BinnedIndex& index, std::vector<ScoredIterator*>& scored, int binCount ) {
191 // compute the query weights; totally ad hoc approximation here
193 // 1. determine the maximum count
194 UINT64 maxCount = index.maximumDocumentCount();
196 // 2. compute these log value things
197 indri::utility::greedy_vector<double> logWeights;
198 double maxWeight = 0;
199 for( int i=0; i<scored.size(); i++ ) {
200 double logWeight = log( 1 + (double)maxCount / (double)scored[i]->iterator->documentCount() );
201 logWeights.push_back( logWeight );
202 maxWeight = std::max( maxWeight, logWeight );
205 // 3. bin them to the bin levels
206 for( int i=0; i<scored.size(); i++ ) {
207 double logWeight = logWeights[i];
208 int binned = (int) (binCount * logWeight / maxWeight);
209 scored[i]->weight *= std::max( 1, binned );
214 // _assignMaxScore
217 void BinOrderedBinnedRetrieval::_assignMaxScore( std::vector<ScoredIterator*>& scored ) {
218 for( int i=0; i<scored.size(); i++ ) {
219 _assignMaxScore( scored[i] );
224 // _assignMaxScore
227 void BinOrderedBinnedRetrieval::_assignMaxScore( ScoredIterator* scored ) {
228 scored->maxScore = scored->weight * scored->iterator->bin();