2 // DocumentOrderedBinnedMaxScoreRetrieval
4 // 5 November 2007 -- Trevor Strohman
6 // BSD License (http://galagosearch.org/license)
9 #include "DocumentOrderedBinnedMaxScoreRetrieval.hpp"
10 #include "indri/ScoredExtentResult.hpp"
15 #include "lemur/Exception.hpp"
16 #include "Logging.hpp"
18 #define MAX_TERMS (256)
24 static std::vector
<indri::api::ScoredExtentResult
> _convertQueue( std::priority_queue
<indri::api::ScoredExtentResult
>& queue
) {
25 std::vector
<indri::api::ScoredExtentResult
> result
;
27 while( queue
.size() > 0 ) {
28 result
.push_back( queue
.top() );
32 std::sort( result
.begin(), result
.end() );
37 // removeDoneIterators
39 // Scans through the list of iterators, looking for any that are now done. If an
40 // iterator is done, it is removed from the vector and deleted.
43 static void _removeDoneIterators( std::vector
<DocumentOrderedBinnedIterator
*>& iterators
) {
44 for( int i
=0; i
<iterators
.size(); i
++ ) {
45 if( iterators
[i
]->isDone() ) {
47 iterators
.erase( iterators
.begin() + i
);
56 // For each iterator i, remaining[i] is the total remaining score possible after
57 // iterator i. remaining[n-1] is defined to be 0.
59 // This iterator can be used for early termination. If
63 static void _computeRemaining( int* remaining
,
64 const std::vector
<DocumentOrderedBinnedIterator
*>& iterators
) {
65 if( iterators
.size() > 0 )
66 remaining
[iterators
.size()-1] = 0;
68 for( int i
=iterators
.size()-2; i
>= 0; i
-- ) {
69 remaining
[i
] = remaining
[i
+1] + iterators
[i
+1]->listBound();
76 // For each iterator i, bounds[i] is the required score contribution of that
77 // iterator (and all previous iterators) in order to have a chance of entering
81 static void _computeBounds( int* bounds
,
83 int& unimportantBound
,
84 const std::vector
<DocumentOrderedBinnedIterator
*>& iterators
,
88 important
= iterators
.size();
90 for( int i
=iterators
.size()-1; i
>= 0; i
-- ) {
91 int oldBound
= bounds
[i
];
92 bounds
[i
] = std::max( 0, threshold
- total
);
93 total
+= iterators
[i
]->listBound();
95 if( oldBound
== 0 && bounds
[i
] > 0 ) {
96 galago_log_query_terms( i
);
99 if( i
> 0 && bounds
[i
-1] == 0 && bounds
[i
] > 0 ) {
101 unimportantBound
= bounds
[i
];
105 assert( important
<= iterators
.size() );
106 bounds
[iterators
.size()] = threshold
;
110 static void _postCompletedIterator( std::vector
<DocumentOrderedBinnedIterator
*>& iterators
,
112 int& unimportantBound
,
116 _removeDoneIterators( iterators
);
117 _computeBounds( bounds
, important
, unimportantBound
, iterators
, threshold
);
118 _computeRemaining( remaining
, iterators
);
125 void DocumentOrderedBinnedMaxScoreRetrieval::openRead( const std::string
& indexPath
) {
126 _index
.openRead( indexPath
);
132 // This method contains the core logic for running queries.
134 // Notice that the _scoreSkipping and _conjunctive variables
135 // are actually template parameters. This allows the compiler to strip out large
136 // sections of the code based on these parameter settings, and to schedule the
137 // instructions more efficiently.
139 // This method always at least does Turtle and Flood's max-score optimization.
140 // If _scoreSkipping is enabled, this also does Strohman's score skipping
141 // optimization. If the _conjunctive option is set, it requires all query
142 // terms to appear in any query result.
145 template<bool _scoreSkipping
, bool _conjunctive
>
146 static std::vector
<indri::api::ScoredExtentResult
>
147 _runQuery( std::vector
<DocumentOrderedBinnedIterator
*>& iterators
,
150 galago_logging_docordered log
;
152 threshold
= std::max( threshold
, 0 );
153 assert( requested
> 0 );
155 if( iterators
.size() > MAX_TERMS
) {
156 LEMUR_THROW( LEMUR_BAD_PARAMETER_ERROR
, "Too many terms" );
159 std::priority_queue
<indri::api::ScoredExtentResult
> queue
;
161 // we do fixed allocation for terms here to help out the compiler a little
162 int bounds
[MAX_TERMS
+1];
163 int remaining
[MAX_TERMS
+1];
165 // all iterators < important are the "important" iterators for max score
166 int important
= iterators
.size();
167 // if the score contribution of the important iterators is not at least
168 // unimportantBound, then don't bother with the unimportant iterators
169 int unimportantBound
= 0;
170 int document
= MAX_INT32
;
172 // find the smallest document in any iterator
173 for( int i
=0; i
<iterators
.size(); i
++ ) {
174 document
= std::min( iterators
[i
]->currentDocument(), document
);
177 _computeBounds( bounds
, important
, unimportantBound
, iterators
, threshold
);
178 _computeRemaining( remaining
, iterators
);
181 while (document
< MAX_INT32
) {
186 // assertion: document is the minimum document to which any iterator
187 // in the important set is pointing.
189 // score all of the important iterators
190 for( int i
=0; i
<important
; i
++ ) {
191 DocumentOrderedBinnedIterator
* iter
= iterators
[i
];
192 assert( !iter
->isDone() );
195 if( iter
->currentDocument() == document
) {
196 score
+= iter
->currentScore();
197 } else if( _conjunctive
) {
203 // if the score we've achieved so far is not greater than unimportantBound,
204 // this document will not enter the ranked list. Otherwise we continue
205 // looking at the remaining iterators.
207 if( score
>= unimportantBound
) {
208 // now, fill in remaining scores
209 for( int i
=important
; i
<iterators
.size(); i
++ ) {
212 // this is the amount of score mass we need to continue
213 int required
= std::max( 0, threshold
- score
- remaining
[i
] );
215 DocumentOrderedBinnedIterator
* iter
= iterators
[i
];
216 assert( !iter
->isDone() );
218 // if we're score skipping, we first make sure that we're caught up
219 // on reading skip data. The skip data gives us a bound on the score
220 // contribution from this document. If that bound is less than the
221 // required score from this iterator, we can quit without actually
222 // looking at the posting.
224 if( _scoreSkipping
) {
225 iter
->readSkipsTo( document
);
226 assert( !iter
->isDone() );
227 assert( iter
->currentBoundDocument() >= document
&& iter
->lastBoundDocument() < document
);
229 if( iter
->currentBound() < required
) {
235 assert( !iter
->isDone() );
236 bool foundDocument
= iter
->skipToDocument( document
);
238 // the movement might have caused the iterator to end, so check that
239 if( iter
->isDone() ) {
242 return _convertQueue( queue
);
244 _postCompletedIterator( iterators
, important
, unimportantBound
,
245 bounds
, remaining
, threshold
);
250 assert( !iter
->isDone() );
252 // add in the score from this document
253 if( foundDocument
) {
254 score
+= iter
->currentScore();
255 } else if( _conjunctive
) {
261 // break if there's no way this document can enter the ranked list
262 if( bounds
[i
] > score
) {
269 // push score onto heap if it's a good one
270 if( score
>= threshold
) {
272 queue
.push( indri::api::ScoredExtentResult( score
, document
, 0, 0 ) );
274 while( queue
.size() > requested
) {
277 if( queue
.size() >= 1 ) {
278 int lastThreshold
= threshold
;
279 threshold
= (int)queue
.top().score
;
281 if( lastThreshold
!= threshold
) {
282 _computeBounds( bounds
, important
, unimportantBound
, iterators
, threshold
);
288 // move to the next reasonable document
289 int lastDocument
= document
;
291 if( !_conjunctive
) {
292 document
= MAX_INT32
;
295 // if the first term is required, skip forward to find a usable document
296 if( _scoreSkipping
&& bounds
[0] > 0 ) {
297 DocumentOrderedBinnedIterator
* iter
= iterators
[0];
299 assert(!iter
->isDone());
300 if( iter
->currentDocument() == lastDocument
)
301 iter
->nextDocument();
306 // make sure we're caught up on the skip data
307 iter
->readSkipsTo( iter
->currentDocument() );
309 // skip forward until we find a document that has a score of
310 // at least bounds[0].
311 iter
->skipToBound( bounds
[0] );
312 document
= iter
->currentDocument();
313 log
.scoreBoundSkip
++;
321 // try to find a usable document
322 assert( important
<= iterators
.size() );
323 for( int i
=0; i
<important
; i
++ ) {
324 DocumentOrderedBinnedIterator
* iter
= iterators
[i
];
325 assert( iter
->isDone() == false );
327 // if this iterator is pointing to the last scored document, we need to move it
328 if( iter
->currentDocument() == lastDocument
) {
329 iter
->nextDocument();
331 // might have run off the end of the iterator, so check that
332 if( iter
->isDone() ) {
335 return _convertQueue( queue
);
337 _postCompletedIterator( iterators
, important
, unimportantBound
,
338 bounds
, remaining
, threshold
);
347 assert( !iter
->isDone() );
348 int current
= iter
->currentDocument();
350 if( current
> lastDocument
) {
352 document
= std::max( document
, iter
->currentDocument() );
354 document
= std::min( document
, iter
->currentDocument() );
362 return _convertQueue(queue
);
369 std::vector
<indri::api::ScoredExtentResult
>
370 DocumentOrderedBinnedMaxScoreRetrieval::runQuery( const std::vector
<QueryTerm
>& terms
,
373 std::vector
<DocumentOrderedBinnedIterator
*> iterators
= getIterators( terms
);
374 std::sort( iterators
.begin(), iterators
.end(), DocumentOrderedBinnedIterator::bound_greater() );
375 galago_log_query_terms( iterators
.size() );
377 if( _scoreSkipping
== true && _conjunctive
== true ) {
378 return _runQuery
<true, true>( iterators
, requested
, threshold
);
379 } else if( _scoreSkipping
== true && _conjunctive
== false ) {
380 return _runQuery
<true, false>( iterators
, requested
, threshold
);
381 } else if( _scoreSkipping
== false && _conjunctive
== true ) {
382 return _runQuery
<false, true>( iterators
, requested
, threshold
);
384 return _runQuery
<false, false>( iterators
, requested
, threshold
);
392 std::vector
<DocumentOrderedBinnedIterator
*>
393 DocumentOrderedBinnedMaxScoreRetrieval::getIterators( const std::vector
<QueryTerm
>& terms
) {
394 std::vector
<DocumentOrderedBinnedIterator
*> iterators
;
396 for( int i
=0; i
<terms
.size(); i
++ ) {
397 const QueryTerm
& term
= terms
[i
];
398 DocumentOrderedBinnedIterator
* iterator
= _index
.getTerm( term
.text
, term
.field
);
401 iterators
.push_back( iterator
);
411 std::string
DocumentOrderedBinnedMaxScoreRetrieval::getDocument( int doc
) {
412 return _index
.getDocument(doc
);
419 void DocumentOrderedBinnedMaxScoreRetrieval::close() {