Initial import into git.
[galago.git] / cpp / galago / src / DocumentOrderedBinnedMaxScoreRetrieval.cpp
bloba87746dfe1fbd9982ccd64f0019433c7f8b75807
1 //
2 // DocumentOrderedBinnedMaxScoreRetrieval
3 //
4 // 5 November 2007 -- Trevor Strohman
5 //
6 // BSD License (http://galagosearch.org/license)
7 //
9 #include "DocumentOrderedBinnedMaxScoreRetrieval.hpp"
10 #include "indri/ScoredExtentResult.hpp"
11 #include "Query.hpp"
12 #include <vector>
13 #include <queue>
14 #include <assert.h>
15 #include "lemur/Exception.hpp"
16 #include "Logging.hpp"
18 #define MAX_TERMS (256)
21 // convertQueue
24 static std::vector<indri::api::ScoredExtentResult> _convertQueue( std::priority_queue<indri::api::ScoredExtentResult>& queue ) {
25 std::vector<indri::api::ScoredExtentResult> result;
27 while( queue.size() > 0 ) {
28 result.push_back( queue.top() );
29 queue.pop();
32 std::sort( result.begin(), result.end() );
33 return result;
37 // removeDoneIterators
39 // Scans through the list of iterators, looking for any that are now done. If an
40 // iterator is done, it is removed from the vector and deleted.
43 static void _removeDoneIterators( std::vector<DocumentOrderedBinnedIterator*>& iterators ) {
44 for( int i=0; i<iterators.size(); i++ ) {
45 if( iterators[i]->isDone() ) {
46 delete iterators[i];
47 iterators.erase( iterators.begin() + i );
48 i--;
54 // computeRemaining
56 // For each iterator i, remaining[i] is the total remaining score possible after
57 // iterator i. remaining[n-1] is defined to be 0.
59 // This iterator can be used for early termination. If
60 //
63 static void _computeRemaining( int* remaining,
64 const std::vector<DocumentOrderedBinnedIterator*>& iterators ) {
65 if( iterators.size() > 0 )
66 remaining[iterators.size()-1] = 0;
68 for( int i=iterators.size()-2; i >= 0; i-- ) {
69 remaining[i] = remaining[i+1] + iterators[i+1]->listBound();
74 // computeBounds
76 // For each iterator i, bounds[i] is the required score contribution of that
77 // iterator (and all previous iterators) in order to have a chance of entering
78 // the ranked list.
81 static void _computeBounds( int* bounds,
82 int& important,
83 int& unimportantBound,
84 const std::vector<DocumentOrderedBinnedIterator*>& iterators,
85 int threshold ) {
86 int total = 0;
87 unimportantBound = 0;
88 important = iterators.size();
90 for( int i=iterators.size()-1; i >= 0; i-- ) {
91 int oldBound = bounds[i];
92 bounds[i] = std::max( 0, threshold - total );
93 total += iterators[i]->listBound();
95 if( oldBound == 0 && bounds[i] > 0 ) {
96 galago_log_query_terms( i );
99 if( i > 0 && bounds[i-1] == 0 && bounds[i] > 0 ) {
100 important = i+1;
101 unimportantBound = bounds[i];
105 assert( important <= iterators.size() );
106 bounds[iterators.size()] = threshold;
110 static void _postCompletedIterator( std::vector<DocumentOrderedBinnedIterator*>& iterators,
111 int& important,
112 int& unimportantBound,
113 int* bounds,
114 int* remaining,
115 int threshold ) {
116 _removeDoneIterators( iterators );
117 _computeBounds( bounds, important, unimportantBound, iterators, threshold );
118 _computeRemaining( remaining, iterators );
122 // openRead
125 void DocumentOrderedBinnedMaxScoreRetrieval::openRead( const std::string& indexPath ) {
126 _index.openRead( indexPath );
130 // runQuery template
132 // This method contains the core logic for running queries.
134 // Notice that the _scoreSkipping and _conjunctive variables
135 // are actually template parameters. This allows the compiler to strip out large
136 // sections of the code based on these parameter settings, and to schedule the
137 // instructions more efficiently.
139 // This method always at least does Turtle and Flood's max-score optimization.
140 // If _scoreSkipping is enabled, this also does Strohman's score skipping
141 // optimization. If the _conjunctive option is set, it requires all query
142 // terms to appear in any query result.
145 template<bool _scoreSkipping, bool _conjunctive>
146 static std::vector<indri::api::ScoredExtentResult>
147 _runQuery( std::vector<DocumentOrderedBinnedIterator*>& iterators,
148 int requested,
149 int threshold ) {
150 galago_logging_docordered log;
152 threshold = std::max( threshold, 0 );
153 assert( requested > 0 );
155 if( iterators.size() > MAX_TERMS ) {
156 LEMUR_THROW( LEMUR_BAD_PARAMETER_ERROR, "Too many terms" );
159 std::priority_queue<indri::api::ScoredExtentResult> queue;
161 // we do fixed allocation for terms here to help out the compiler a little
162 int bounds[MAX_TERMS+1];
163 int remaining[MAX_TERMS+1];
165 // all iterators < important are the "important" iterators for max score
166 int important = iterators.size();
167 // if the score contribution of the important iterators is not at least
168 // unimportantBound, then don't bother with the unimportant iterators
169 int unimportantBound = 0;
170 int document = MAX_INT32;
172 // find the smallest document in any iterator
173 for( int i=0; i<iterators.size(); i++ ) {
174 document = std::min( iterators[i]->currentDocument(), document );
177 _computeBounds( bounds, important, unimportantBound, iterators, threshold );
178 _computeRemaining( remaining, iterators );
180 retry:
181 while (document < MAX_INT32) {
182 // score candidate
183 int score = 0;
184 int lastIter = 0;
186 // assertion: document is the minimum document to which any iterator
187 // in the important set is pointing.
189 // score all of the important iterators
190 for( int i=0; i<important; i++ ) {
191 DocumentOrderedBinnedIterator* iter = iterators[i];
192 assert( !iter->isDone() );
193 lastIter = i;
195 if( iter->currentDocument() == document ) {
196 score += iter->currentScore();
197 } else if( _conjunctive ) {
198 score = -1;
199 break;
203 // if the score we've achieved so far is not greater than unimportantBound,
204 // this document will not enter the ranked list. Otherwise we continue
205 // looking at the remaining iterators.
207 if( score >= unimportantBound ) {
208 // now, fill in remaining scores
209 for( int i=important; i<iterators.size(); i++ ) {
210 lastIter = i;
212 // this is the amount of score mass we need to continue
213 int required = std::max( 0, threshold - score - remaining[i] );
215 DocumentOrderedBinnedIterator* iter = iterators[i];
216 assert( !iter->isDone() );
218 // if we're score skipping, we first make sure that we're caught up
219 // on reading skip data. The skip data gives us a bound on the score
220 // contribution from this document. If that bound is less than the
221 // required score from this iterator, we can quit without actually
222 // looking at the posting.
224 if( _scoreSkipping ) {
225 iter->readSkipsTo( document );
226 assert( !iter->isDone() );
227 assert( iter->currentBoundDocument() >= document && iter->lastBoundDocument() < document );
229 if( iter->currentBound() < required ) {
230 score = -1;
231 break;
235 assert( !iter->isDone() );
236 bool foundDocument = iter->skipToDocument( document );
238 // the movement might have caused the iterator to end, so check that
239 if( iter->isDone() ) {
240 if( _conjunctive ) {
241 galago_log( log );
242 return _convertQueue( queue );
243 } else {
244 _postCompletedIterator( iterators, important, unimportantBound,
245 bounds, remaining, threshold );
246 goto retry;
250 assert( !iter->isDone() );
252 // add in the score from this document
253 if( foundDocument ) {
254 score += iter->currentScore();
255 } else if( _conjunctive ) {
256 log.docsScored[i]++;
257 score = -1;
258 break;
261 // break if there's no way this document can enter the ranked list
262 if( bounds[i] > score ) {
263 log.docsScored[i]++;
264 break;
269 // push score onto heap if it's a good one
270 if( score >= threshold ) {
271 log.fullScored++;
272 queue.push( indri::api::ScoredExtentResult( score, document, 0, 0 ) );
274 while( queue.size() > requested ) {
275 queue.pop();
277 if( queue.size() >= 1 ) {
278 int lastThreshold = threshold;
279 threshold = (int)queue.top().score;
281 if( lastThreshold != threshold ) {
282 _computeBounds( bounds, important, unimportantBound, iterators, threshold );
288 // move to the next reasonable document
289 int lastDocument = document;
291 if( !_conjunctive ) {
292 document = MAX_INT32;
295 // if the first term is required, skip forward to find a usable document
296 if( _scoreSkipping && bounds[0] > 0 ) {
297 DocumentOrderedBinnedIterator* iter = iterators[0];
299 assert(!iter->isDone());
300 if( iter->currentDocument() == lastDocument )
301 iter->nextDocument();
303 if( iter->isDone() )
304 break;
306 // make sure we're caught up on the skip data
307 iter->readSkipsTo( iter->currentDocument() );
309 // skip forward until we find a document that has a score of
310 // at least bounds[0].
311 iter->skipToBound( bounds[0] );
312 document = iter->currentDocument();
313 log.scoreBoundSkip++;
315 if( iter->isDone() )
316 break;
318 continue;
321 // try to find a usable document
322 assert( important <= iterators.size() );
323 for( int i=0; i<important; i++ ) {
324 DocumentOrderedBinnedIterator* iter = iterators[i];
325 assert( iter->isDone() == false );
327 // if this iterator is pointing to the last scored document, we need to move it
328 if( iter->currentDocument() == lastDocument ) {
329 iter->nextDocument();
331 // might have run off the end of the iterator, so check that
332 if( iter->isDone() ) {
333 if( _conjunctive ) {
334 galago_log( log );
335 return _convertQueue( queue );
336 } else {
337 _postCompletedIterator( iterators, important, unimportantBound,
338 bounds, remaining, threshold );
339 i--;
342 iter = 0;
346 if( iter != 0 ) {
347 assert( !iter->isDone() );
348 int current = iter->currentDocument();
350 if( current > lastDocument ) {
351 if( _conjunctive ) {
352 document = std::max( document, iter->currentDocument() );
353 } else {
354 document = std::min( document, iter->currentDocument() );
361 galago_log( log );
362 return _convertQueue(queue);
366 // runQuery
369 std::vector<indri::api::ScoredExtentResult>
370 DocumentOrderedBinnedMaxScoreRetrieval::runQuery( const std::vector<QueryTerm>& terms,
371 int requested,
372 int threshold ) {
373 std::vector<DocumentOrderedBinnedIterator*> iterators = getIterators( terms );
374 std::sort( iterators.begin(), iterators.end(), DocumentOrderedBinnedIterator::bound_greater() );
375 galago_log_query_terms( iterators.size() );
377 if( _scoreSkipping == true && _conjunctive == true ) {
378 return _runQuery<true, true>( iterators, requested, threshold );
379 } else if( _scoreSkipping == true && _conjunctive == false ) {
380 return _runQuery<true, false>( iterators, requested, threshold );
381 } else if( _scoreSkipping == false && _conjunctive == true ) {
382 return _runQuery<false, true>( iterators, requested, threshold );
383 } else {
384 return _runQuery<false, false>( iterators, requested, threshold );
389 // getIterators
392 std::vector<DocumentOrderedBinnedIterator*>
393 DocumentOrderedBinnedMaxScoreRetrieval::getIterators( const std::vector<QueryTerm>& terms ) {
394 std::vector<DocumentOrderedBinnedIterator*> iterators;
396 for( int i=0; i<terms.size(); i++ ) {
397 const QueryTerm& term = terms[i];
398 DocumentOrderedBinnedIterator* iterator = _index.getTerm( term.text, term.field );
400 if( iterator )
401 iterators.push_back( iterator );
404 return iterators;
408 // getDocument
411 std::string DocumentOrderedBinnedMaxScoreRetrieval::getDocument( int doc ) {
412 return _index.getDocument(doc);
416 // close
419 void DocumentOrderedBinnedMaxScoreRetrieval::close() {
420 _index.close();