Initial import into git.
[galago.git] / cpp / galago / include / ArrayAccumulators.hpp
blob42f013ae932b16a9fe0f7a5733a014329ffac0bd
1 //
2 // ArrayAccumulators
3 //
4 // 5 January 2007 -- tds
5 //
7 #include "LongArrayAccumulators.hpp"
8 #include "indri/indri-platform.h"
9 #include "lemur/RVLCompress.hpp"
10 #include "Logging.hpp"
11 #include "Accumulators.hpp"
12 #include "Bits.hpp"
13 #include <vector>
14 #include <string>
15 #include <queue>
17 #ifndef GALAGO_ARRAYACCUMULATORS_HPP
18 #define GALAGO_ARRAYACCUMULATORS_HPP
20 template<class _AccumulatorType>
21 class ArrayAccumulators : public Accumulators {
22 public:
23 typedef typename AccumulatorTypes<_AccumulatorType>::term_type term_type;
24 typedef typename AccumulatorTypes<_AccumulatorType>::update_type update_type;
26 private:
27 std::vector<_AccumulatorType> _first;
28 std::vector<_AccumulatorType> _second;
29 std::vector<_AccumulatorType>& _accumulators;
30 std::vector<_AccumulatorType>& _newAccumulators;
32 std::vector<UINT32> _maxUnseen;
33 std::vector<UINT32> _fullUnseen;
34 std::vector<UINT32> _scoreCounts;
36 int _termCount;
37 int _threshold;
38 bool _needsTrim;
39 bool _canIgnorePostings;
41 static inline void _moveNextOrAccumulator( const std::vector<_AccumulatorType>& accumulators,
42 UINT32& nextAccumulator,
43 int& index ) {
44 index++;
46 if( index == accumulators.size() ) {
47 nextAccumulator = MAX_UINT32;
48 } else {
49 nextAccumulator = accumulators[index].document();
53 static inline void _moveNextAndAccumulator( const std::vector<_AccumulatorType>& accumulators,
54 term_type termBit,
55 UINT32& nextAccumulator,
56 int& index ) {
57 while (true) {
58 index++;
60 if( index == accumulators.size() ) {
61 nextAccumulator = MAX_UINT32;
62 break;
63 } else {
64 const _AccumulatorType& accumulator = accumulators[index];
66 if( accumulator.containsTerm( termBit ) ) {
67 continue;
68 } else {
69 nextAccumulator = accumulator.document();
70 break;
77 // move_to_document
79 // Move forward in the inverted list to the next general location where we may be able to
80 // update an accumulator. We do this efficiently by incorporating skip data in the inverted
81 // list.
83 // Skip data comes in compressed and delta-encoded (document, distance) pairs. A
84 // (document, distance) tuple implies that the posting for document is immediately before
85 // binStart + distance. So, if we are looking for nextDocument, and nextDocument > document,
86 // we can safely skip to binStart + distance.
88 // Our strategy is to decode the skip information slightly ahead of nextDocument, so generally
89 // skipDocument is set to something greater than nextDocument upon return. This helps reduce
90 // the overhead of skip processing in the event that there are lots of accumulators.
91 // Notice that the function begins with a quick check to see if the skip data will be useful,
92 // and it returns if it won't be useful.
95 static inline const char* _moveToDocument( const UINT32 nextDocument, UINT32& currentDocument,
96 const char* binStart, const char* binData,
97 UINT32& skipDocument, size_t& skipDistance,
98 const char*& skipData, const char* skipEnd ) {
99 // if the next skip location isn't available, or if it is further
100 // than we need to go, quit now.
101 if( nextDocument <= skipDocument )
102 return binData;
104 #ifndef GALAGO_NO_SKIPS
105 if( skipData == skipEnd )
106 #endif
108 skipDocument = MAX_UINT32;
109 return binData;
112 // we know there is more skip information, so we'll decode it
113 while ( skipData < skipEnd ) {
114 int docDelta;
115 int distDelta;
117 // decode a (document, distance) pair
118 skipData = lemur::utility::RVLCompress::decompress_int( skipData, docDelta );
119 skipData = lemur::utility::RVLCompress::decompress_int( skipData, distDelta );
120 skipDocument += docDelta;
121 skipDistance += distDelta;
123 // if this latest skip information is more than we need, quit,
124 // while storing the next skip point in (skipDocument, skipDistance).
125 if( nextDocument <= skipDocument ) {
126 const char* lastSkip = binStart + skipDistance - distDelta;
128 if( lastSkip < binData ) {
129 return binData;
130 } else {
131 currentDocument = skipDocument - docDelta;
132 return lastSkip;
137 // we're all done with skip data, so go to the point indicated by the last skip
138 if( binData > binStart + skipDistance ) {
139 skipDocument = MAX_UINT32;
140 return binData;
143 currentDocument = skipDocument;
144 skipDocument = MAX_UINT32;
145 return binStart + skipDistance;
149 // _newScore
152 void _newScore( UINT32 score ) {
153 if( _scoreCounts.size() < score+1 ) {
154 _scoreCounts.resize( score+1, 0 );
157 _scoreCounts[score]++;
161 // _updateScore
164 void _updateScore( UINT32 oldScore, UINT32 newScore ) {
165 if( _scoreCounts.size() < newScore+1 ) {
166 _scoreCounts.resize( newScore+1, 0 );
169 _scoreCounts[oldScore]--;
170 _scoreCounts[newScore]++;
174 // _swapAccumulators
177 void _swapAccumulators() {
178 if( &_accumulators == &_first ) {
179 _accumulators = _second;
180 _newAccumulators = _first;
181 } else {
182 _accumulators = _first;
183 _newAccumulators = _second;
186 _newAccumulators.clear();
189 public:
190 ArrayAccumulators( int termCount ) :
191 _accumulators(_first),
192 _newAccumulators(_second)
194 int actual = std::min( termCount, _AccumulatorType::maxTerms() );
195 _threshold = -1;
196 _canIgnorePostings = false;
197 _maxUnseen.resize( actual, _AccumulatorType::maxScore() );
198 _termCount = actual;
199 _needsTrim = false;
203 // setThresholdScore
206 void setThresholdScore( int threshold ) {
207 _threshold = std::max( threshold, _threshold );
211 // threshold
214 int threshold() const {
215 return _threshold;
219 // useAndMode
222 bool useAndMode() {
223 return totalUnseen() < _threshold;
227 // processBin
230 void processBin( int score, BinOrderedBinnedIterator* iterator ) {
231 processBinOr( score, -1, iterator );
235 // processBin
238 void processBin( int score, int termIndex, BinOrderedBinnedIterator* iterator ) {
239 if( useAndMode() ) {
240 processBinAnd( score, termIndex, iterator );
241 } else {
242 processBinOr( score, termIndex, iterator );
247 // processBinOr
250 void processBinOr( int score, int termIndex, BinOrderedBinnedIterator* iterator ) {
251 galago_logging log;
252 galago_log_bin_start( log, GALAGO_MODE_OR, score, termIndex, iterator->postingsDataLength(), iterator->skipDataLength(), _accumulators.size(), _threshold );
254 assert( termIndex >= 0 );
255 int accIndex=-1;
256 update_type updateValue = _AccumulatorType::buildUpdate( termIndex, score );
257 UINT32 nextAccumulator = 0;
259 _moveNextOrAccumulator( _accumulators, nextAccumulator, accIndex );
260 _newAccumulators.reserve(_accumulators.size() + iterator->binDataLength() / 2 );
262 const char* binData = iterator->binData();
263 const char* binEnd = iterator->binData() + iterator->binDataLength();
264 int document = 0;
266 while( binData < binEnd ) {
267 // decompress the next posting
268 int delta = 0;
269 binData = lemur::utility::RVLCompress::decompress_int( binData, delta );
270 document += delta;
272 galago_log_posting( log );
274 // copy all accumulators that come before the current document
275 while( nextAccumulator < document ) {
276 _newAccumulators.push_back( _accumulators[accIndex] );
277 _moveNextOrAccumulator( _accumulators, nextAccumulator, accIndex );
280 // if this document already appears in the accumulators, update it
281 if( nextAccumulator == document ) {
282 _AccumulatorType& accumulator = _accumulators[accIndex];
283 UINT32 oldScore = accumulator.score();
284 _updateScore( oldScore, oldScore + score );
286 accumulator.update( updateValue );
287 _newAccumulators.push_back( accumulator );
288 _moveNextOrAccumulator( _accumulators, nextAccumulator, accIndex );
290 galago_log_accumulator( log, document, oldScore + score );
291 } else {
292 // otherwise, build a new accumulator for the new document
293 _AccumulatorType newAccumulator( document, updateValue );
294 _newAccumulators.push_back( newAccumulator );
295 _newScore( score );
297 galago_log_accumulator( log, document, score );
301 // some old accumulators may be left; copy those too
302 std::copy( _accumulators.begin() + accIndex, _accumulators.end(), std::back_inserter(_newAccumulators) );
303 // copy the table back into place
304 _swapAccumulators();
306 galago_log_bin_end( log, _accumulators.size() );
310 // processBinAnd
313 void processBinAnd( int score, int termIndex, BinOrderedBinnedIterator* iterator ) {
314 galago_logging log;
315 galago_log_bin_start( log, GALAGO_MODE_AND, score, termIndex, iterator->postingsDataLength(), iterator->skipDataLength(), _accumulators.size(), _threshold );
317 int accIndex=0;
319 if( _accumulators.size() == 0 )
320 return;
322 assert( termIndex <= _AccumulatorType::maxTerms() );
323 term_type termBit = _AccumulatorType::buildTerm( termIndex );
324 update_type updateValue = _AccumulatorType::buildUpdate( termIndex, score );
325 UINT32 nextDocument = _accumulators[0].document();
326 size_t skipDistance = 0;
328 const char* binDataStart = iterator->binData();
329 const char* binData = iterator->binData();
330 const char* binEnd = iterator->binData() + iterator->binDataLength();
332 const char* skipData = iterator->skipData();
333 const char* skipEnd = skipData + iterator->skipDataLength();
334 UINT32 skipDocument = 0;
336 // turn off skipping entirely if we have lots of accumulators
337 if( iterator->skipDataLength()/4 <= _accumulators.size() ) {
338 skipDocument = MAX_UINT32;
339 skipData = skipEnd;
342 UINT32 document = 0;
343 binData = _moveToDocument( nextDocument, document, binDataStart, binData, skipDocument, skipDistance, skipData, skipEnd );
345 while( binData < binEnd ) {
346 // decompress the next posting
347 int delta = 0;
348 binData = lemur::utility::RVLCompress::decompress_int( binData, delta );
349 document += delta;
351 galago_log_posting( log );
353 // if this current document is smaller than the next accumulator, we go to the next one
354 // just ignore it and move on (very common case)
355 if( document < nextDocument )
356 continue;
358 // if the current document is after the next accumulator, we
359 // need to move forward to see if current matches a later accumulator
360 while( document > nextDocument )
361 _moveNextAndAccumulator( _accumulators, termBit, nextDocument, accIndex );
363 if( nextDocument == MAX_UINT32 )
364 break;
366 // update the accumulator, if this is a hit
367 if( document == nextDocument ) {
368 _AccumulatorType& accumulator = _accumulators[accIndex];
369 UINT32 oldScore = accumulator.score();
371 accumulator.update( updateValue );
372 _updateScore( oldScore, oldScore + score );
373 galago_log_accumulator( log, document, score );
375 _moveNextAndAccumulator( _accumulators, termBit, nextDocument, accIndex );
378 // try to move the data pointer forward to the next accumulator
379 binData = _moveToDocument( nextDocument, document, binDataStart, binData, skipDocument, skipDistance, skipData, skipEnd );
382 galago_log_bin_end( log, _accumulators.size() );
386 // setMaxUnseen
389 void setMaxUnseen( int termIndex, int score ) {
390 assert( termIndex < _termCount );
391 assert( score <= _AccumulatorType::maxScore() );
393 _maxUnseen[termIndex] = score;
397 // calculateThresholdScore
399 // The threshold is the n^th largest score in the accumulator table (where n = requested).
400 // Any score in the final ranked list will need to be above this computed threshold value.
403 void calculateThresholdScore( int requested ) {
404 int countsSeen = 0;
405 int i;
407 for( i = _scoreCounts.size()-1; i >= 0; i-- ) {
408 countsSeen += _scoreCounts[i];
410 if( countsSeen >= requested )
411 break;
414 #ifndef GALAGO_NO_AND_MODE
415 if( i > 0 )
416 _needsTrim = true;
418 setThresholdScore( i );
419 #else
420 _threshold = -1;
421 #endif
425 // trim
428 void trim() {
429 if( _threshold < 0 || _needsTrim == false )
430 return;
432 #ifdef GALAGO_NO_TRIM
433 return;
434 #endif
436 _AccumulatorType::computeUnseenScoreArray( _fullUnseen, _maxUnseen );
438 #ifdef GALAGO_TRIM_ONCE
439 for( int i=0; i<_accumulators.size(); i++ ) {
440 const _AccumulatorType& accumulator = _accumulators[i];
441 UINT32 score = accumulator.score();
443 if( score >= _threshold )
444 continue;
446 UINT32 seenTerms = accumulator.terms();
447 UINT32 maxValue = score + accumulator.unseen( _fullUnseen );
449 // can't trim until no low scoring document would make it in the topk
450 if ( maxValue >= _threshold )
451 return;
453 #endif
454 _newAccumulators.reserve( _accumulators.size() );
456 // for each accumulator, we compute the max possible value and trim if it can't be achieved
457 for( int i=0; i<_accumulators.size(); i++ ) {
458 const _AccumulatorType& accumulator = _accumulators[i];
459 UINT32 maxValue = accumulator.score() + accumulator.unseen( _fullUnseen );
461 if ( maxValue >= _threshold ) {
462 _newAccumulators.push_back( _accumulators[i] );
466 _swapAccumulators();
467 _needsTrim = false;
471 // size
474 int size() {
475 return _accumulators.size();
479 // totalUnseen
482 int totalUnseen() {
483 int totalUnseen = 0;
485 for( int i=0; i < _maxUnseen.size(); i++ ) {
486 totalUnseen += _maxUnseen[i];
489 return totalUnseen;
493 // canIgnoreFuturePostings
495 // Checks to see if additional postings could possibly change the
496 // top ranks; returns true if no additional postings are needed.
499 bool canIgnoreFuturePostings( int requested ) {
500 #ifdef GALAGO_NO_IGNORE
501 return false;
502 #endif
504 // if we're not in AND mode yet, any new posting could change
505 // the top of the ranked list
506 if( useAndMode() == false )
507 return false;
509 // if we haven't trimmed the accumulators down very much,
510 // it's probably not time to try this yet.
511 if( _accumulators.size() > 3*requested )
512 return false;
514 // if we already determined that we can, there's no use
515 // checking again
516 if( _canIgnorePostings )
517 return true;
519 // decode accumulators
520 std::vector<_AccumulatorType> partials;
521 _AccumulatorType::computeUnseenScoreArray( _fullUnseen, _maxUnseen );
523 for( int i=0; i < _accumulators.size(); i++ ) {
524 const _AccumulatorType& accumulator = _accumulators[i];
525 UINT32 maxValue = accumulator.score() + accumulator.unseen( _fullUnseen );
527 if ( maxValue < _threshold )
528 continue;
530 partials.push_back( accumulator );
533 if( partials.size() == 0 )
534 return false;
536 std::sort( partials.begin(), partials.end() );
538 for( int i=0; i<partials.size()-1; i++ ) {
539 const _AccumulatorType& one = partials[i];
540 const _AccumulatorType& two = partials[i+1];
542 UINT32 oneLimit = one.score() + one.unseen( _fullUnseen );
544 // we already know that one.score < two.score, so
545 // if oneLimit > two.score, one might surpass two
546 // with additional postings
547 if ( oneLimit > two.score() )
548 return false;
550 // if the two scores are different, but additional
551 // information might make them the same, we need to
552 // get additional information
553 if ( one.score() < two.score() && oneLimit >= two.score() )
554 return false;
557 _canIgnorePostings = true;
558 return true;
562 // topResults
565 std::vector<indri::api::ScoredExtentResult> topResults( int requested ) {
566 // find the top accumulators in the list
567 std::priority_queue<indri::api::ScoredExtentResult, std::vector<indri::api::ScoredExtentResult>, indri::api::ScoredExtentResult::score_greater> scores;
568 int i;
570 for( i = 0; i < std::min<size_t>( requested, _accumulators.size() ); i++ ) {
571 int document = _accumulators[i].document();
572 int score = _accumulators[i].score();
574 scores.push( indri::api::ScoredExtentResult( score, document, 0, 0 ) );
577 UINT32 lowScore = (UINT32) scores.top().score;
579 for( ; i < _accumulators.size(); i++ ) {
580 int document = _accumulators[i].document();
581 int score = _accumulators[i].score();
583 if( lowScore < score ) {
584 scores.push( indri::api::ScoredExtentResult( score, document, 0, 0 ) );
585 scores.pop();
586 lowScore = (UINT32) scores.top().score;
590 std::vector<indri::api::ScoredExtentResult> results;
591 results.resize( scores.size() );
593 for( i=scores.size()-1; i>=0; i-- ) {
594 results[i] = scores.top();
595 scores.pop();
598 return results;
602 // maxTerms
605 int maxTerms() const {
606 return _AccumulatorType::maxTerms();
610 #endif // GALAGO_ARRAYACCUMULATORS_HPP