4 // 5 January 2007 -- tds
7 #include "LongArrayAccumulators.hpp"
8 #include "indri/indri-platform.h"
9 #include "lemur/RVLCompress.hpp"
10 #include "Logging.hpp"
11 #include "Accumulators.hpp"
17 #ifndef GALAGO_ARRAYACCUMULATORS_HPP
18 #define GALAGO_ARRAYACCUMULATORS_HPP
20 template<class _AccumulatorType
>
21 class ArrayAccumulators
: public Accumulators
{
23 typedef typename AccumulatorTypes
<_AccumulatorType
>::term_type term_type
;
24 typedef typename AccumulatorTypes
<_AccumulatorType
>::update_type update_type
;
27 std::vector
<_AccumulatorType
> _first
;
28 std::vector
<_AccumulatorType
> _second
;
29 std::vector
<_AccumulatorType
>& _accumulators
;
30 std::vector
<_AccumulatorType
>& _newAccumulators
;
32 std::vector
<UINT32
> _maxUnseen
;
33 std::vector
<UINT32
> _fullUnseen
;
34 std::vector
<UINT32
> _scoreCounts
;
39 bool _canIgnorePostings
;
41 static inline void _moveNextOrAccumulator( const std::vector
<_AccumulatorType
>& accumulators
,
42 UINT32
& nextAccumulator
,
46 if( index
== accumulators
.size() ) {
47 nextAccumulator
= MAX_UINT32
;
49 nextAccumulator
= accumulators
[index
].document();
53 static inline void _moveNextAndAccumulator( const std::vector
<_AccumulatorType
>& accumulators
,
55 UINT32
& nextAccumulator
,
60 if( index
== accumulators
.size() ) {
61 nextAccumulator
= MAX_UINT32
;
64 const _AccumulatorType
& accumulator
= accumulators
[index
];
66 if( accumulator
.containsTerm( termBit
) ) {
69 nextAccumulator
= accumulator
.document();
79 // Move forward in the inverted list to the next general location where we may be able to
80 // update an accumulator. We do this efficiently by incorporating skip data in the inverted
83 // Skip data comes in compressed and delta-encoded (document, distance) pairs. A
84 // (document, distance) tuple implies that the posting for document is immediately before
85 // binStart + distance. So, if we are looking for nextDocument, and nextDocument > document,
86 // we can safely skip to binStart + distance.
88 // Our strategy is to decode the skip information slightly ahead of nextDocument, so generally
89 // skipDocument is set to something greater than nextDocument upon return. This helps reduce
90 // the overhead of skip processing in the event that there are lots of accumulators.
91 // Notice that the function begins with a quick check to see if the skip data will be useful,
92 // and it returns if it won't be useful.
95 static inline const char* _moveToDocument( const UINT32 nextDocument
, UINT32
& currentDocument
,
96 const char* binStart
, const char* binData
,
97 UINT32
& skipDocument
, size_t& skipDistance
,
98 const char*& skipData
, const char* skipEnd
) {
99 // if the next skip location isn't available, or if it is further
100 // than we need to go, quit now.
101 if( nextDocument
<= skipDocument
)
104 #ifndef GALAGO_NO_SKIPS
105 if( skipData
== skipEnd
)
108 skipDocument
= MAX_UINT32
;
112 // we know there is more skip information, so we'll decode it
113 while ( skipData
< skipEnd
) {
117 // decode a (document, distance) pair
118 skipData
= lemur::utility::RVLCompress::decompress_int( skipData
, docDelta
);
119 skipData
= lemur::utility::RVLCompress::decompress_int( skipData
, distDelta
);
120 skipDocument
+= docDelta
;
121 skipDistance
+= distDelta
;
123 // if this latest skip information is more than we need, quit,
124 // while storing the next skip point in (skipDocument, skipDistance).
125 if( nextDocument
<= skipDocument
) {
126 const char* lastSkip
= binStart
+ skipDistance
- distDelta
;
128 if( lastSkip
< binData
) {
131 currentDocument
= skipDocument
- docDelta
;
137 // we're all done with skip data, so go to the point indicated by the last skip
138 if( binData
> binStart
+ skipDistance
) {
139 skipDocument
= MAX_UINT32
;
143 currentDocument
= skipDocument
;
144 skipDocument
= MAX_UINT32
;
145 return binStart
+ skipDistance
;
152 void _newScore( UINT32 score
) {
153 if( _scoreCounts
.size() < score
+1 ) {
154 _scoreCounts
.resize( score
+1, 0 );
157 _scoreCounts
[score
]++;
164 void _updateScore( UINT32 oldScore
, UINT32 newScore
) {
165 if( _scoreCounts
.size() < newScore
+1 ) {
166 _scoreCounts
.resize( newScore
+1, 0 );
169 _scoreCounts
[oldScore
]--;
170 _scoreCounts
[newScore
]++;
177 void _swapAccumulators() {
178 if( &_accumulators
== &_first
) {
179 _accumulators
= _second
;
180 _newAccumulators
= _first
;
182 _accumulators
= _first
;
183 _newAccumulators
= _second
;
186 _newAccumulators
.clear();
190 ArrayAccumulators( int termCount
) :
191 _accumulators(_first
),
192 _newAccumulators(_second
)
194 int actual
= std::min( termCount
, _AccumulatorType::maxTerms() );
196 _canIgnorePostings
= false;
197 _maxUnseen
.resize( actual
, _AccumulatorType::maxScore() );
206 void setThresholdScore( int threshold
) {
207 _threshold
= std::max( threshold
, _threshold
);
214 int threshold() const {
223 return totalUnseen() < _threshold
;
230 void processBin( int score
, BinOrderedBinnedIterator
* iterator
) {
231 processBinOr( score
, -1, iterator
);
238 void processBin( int score
, int termIndex
, BinOrderedBinnedIterator
* iterator
) {
240 processBinAnd( score
, termIndex
, iterator
);
242 processBinOr( score
, termIndex
, iterator
);
250 void processBinOr( int score
, int termIndex
, BinOrderedBinnedIterator
* iterator
) {
252 galago_log_bin_start( log
, GALAGO_MODE_OR
, score
, termIndex
, iterator
->postingsDataLength(), iterator
->skipDataLength(), _accumulators
.size(), _threshold
);
254 assert( termIndex
>= 0 );
256 update_type updateValue
= _AccumulatorType::buildUpdate( termIndex
, score
);
257 UINT32 nextAccumulator
= 0;
259 _moveNextOrAccumulator( _accumulators
, nextAccumulator
, accIndex
);
260 _newAccumulators
.reserve(_accumulators
.size() + iterator
->binDataLength() / 2 );
262 const char* binData
= iterator
->binData();
263 const char* binEnd
= iterator
->binData() + iterator
->binDataLength();
266 while( binData
< binEnd
) {
267 // decompress the next posting
269 binData
= lemur::utility::RVLCompress::decompress_int( binData
, delta
);
272 galago_log_posting( log
);
274 // copy all accumulators that come before the current document
275 while( nextAccumulator
< document
) {
276 _newAccumulators
.push_back( _accumulators
[accIndex
] );
277 _moveNextOrAccumulator( _accumulators
, nextAccumulator
, accIndex
);
280 // if this document already appears in the accumulators, update it
281 if( nextAccumulator
== document
) {
282 _AccumulatorType
& accumulator
= _accumulators
[accIndex
];
283 UINT32 oldScore
= accumulator
.score();
284 _updateScore( oldScore
, oldScore
+ score
);
286 accumulator
.update( updateValue
);
287 _newAccumulators
.push_back( accumulator
);
288 _moveNextOrAccumulator( _accumulators
, nextAccumulator
, accIndex
);
290 galago_log_accumulator( log
, document
, oldScore
+ score
);
292 // otherwise, build a new accumulator for the new document
293 _AccumulatorType
newAccumulator( document
, updateValue
);
294 _newAccumulators
.push_back( newAccumulator
);
297 galago_log_accumulator( log
, document
, score
);
301 // some old accumulators may be left; copy those too
302 std::copy( _accumulators
.begin() + accIndex
, _accumulators
.end(), std::back_inserter(_newAccumulators
) );
303 // copy the table back into place
306 galago_log_bin_end( log
, _accumulators
.size() );
313 void processBinAnd( int score
, int termIndex
, BinOrderedBinnedIterator
* iterator
) {
315 galago_log_bin_start( log
, GALAGO_MODE_AND
, score
, termIndex
, iterator
->postingsDataLength(), iterator
->skipDataLength(), _accumulators
.size(), _threshold
);
319 if( _accumulators
.size() == 0 )
322 assert( termIndex
<= _AccumulatorType::maxTerms() );
323 term_type termBit
= _AccumulatorType::buildTerm( termIndex
);
324 update_type updateValue
= _AccumulatorType::buildUpdate( termIndex
, score
);
325 UINT32 nextDocument
= _accumulators
[0].document();
326 size_t skipDistance
= 0;
328 const char* binDataStart
= iterator
->binData();
329 const char* binData
= iterator
->binData();
330 const char* binEnd
= iterator
->binData() + iterator
->binDataLength();
332 const char* skipData
= iterator
->skipData();
333 const char* skipEnd
= skipData
+ iterator
->skipDataLength();
334 UINT32 skipDocument
= 0;
336 // turn off skipping entirely if we have lots of accumulators
337 if( iterator
->skipDataLength()/4 <= _accumulators
.size() ) {
338 skipDocument
= MAX_UINT32
;
343 binData
= _moveToDocument( nextDocument
, document
, binDataStart
, binData
, skipDocument
, skipDistance
, skipData
, skipEnd
);
345 while( binData
< binEnd
) {
346 // decompress the next posting
348 binData
= lemur::utility::RVLCompress::decompress_int( binData
, delta
);
351 galago_log_posting( log
);
353 // if this current document is smaller than the next accumulator, we go to the next one
354 // just ignore it and move on (very common case)
355 if( document
< nextDocument
)
358 // if the current document is after the next accumulator, we
359 // need to move forward to see if current matches a later accumulator
360 while( document
> nextDocument
)
361 _moveNextAndAccumulator( _accumulators
, termBit
, nextDocument
, accIndex
);
363 if( nextDocument
== MAX_UINT32
)
366 // update the accumulator, if this is a hit
367 if( document
== nextDocument
) {
368 _AccumulatorType
& accumulator
= _accumulators
[accIndex
];
369 UINT32 oldScore
= accumulator
.score();
371 accumulator
.update( updateValue
);
372 _updateScore( oldScore
, oldScore
+ score
);
373 galago_log_accumulator( log
, document
, score
);
375 _moveNextAndAccumulator( _accumulators
, termBit
, nextDocument
, accIndex
);
378 // try to move the data pointer forward to the next accumulator
379 binData
= _moveToDocument( nextDocument
, document
, binDataStart
, binData
, skipDocument
, skipDistance
, skipData
, skipEnd
);
382 galago_log_bin_end( log
, _accumulators
.size() );
389 void setMaxUnseen( int termIndex
, int score
) {
390 assert( termIndex
< _termCount
);
391 assert( score
<= _AccumulatorType::maxScore() );
393 _maxUnseen
[termIndex
] = score
;
397 // calculateThresholdScore
399 // The threshold is the n^th largest score in the accumulator table (where n = requested).
400 // Any score in the final ranked list will need to be above this computed threshold value.
403 void calculateThresholdScore( int requested
) {
407 for( i
= _scoreCounts
.size()-1; i
>= 0; i
-- ) {
408 countsSeen
+= _scoreCounts
[i
];
410 if( countsSeen
>= requested
)
414 #ifndef GALAGO_NO_AND_MODE
418 setThresholdScore( i
);
429 if( _threshold
< 0 || _needsTrim
== false )
432 #ifdef GALAGO_NO_TRIM
436 _AccumulatorType::computeUnseenScoreArray( _fullUnseen
, _maxUnseen
);
438 #ifdef GALAGO_TRIM_ONCE
439 for( int i
=0; i
<_accumulators
.size(); i
++ ) {
440 const _AccumulatorType
& accumulator
= _accumulators
[i
];
441 UINT32 score
= accumulator
.score();
443 if( score
>= _threshold
)
446 UINT32 seenTerms
= accumulator
.terms();
447 UINT32 maxValue
= score
+ accumulator
.unseen( _fullUnseen
);
449 // can't trim until no low scoring document would make it in the topk
450 if ( maxValue
>= _threshold
)
454 _newAccumulators
.reserve( _accumulators
.size() );
456 // for each accumulator, we compute the max possible value and trim if it can't be achieved
457 for( int i
=0; i
<_accumulators
.size(); i
++ ) {
458 const _AccumulatorType
& accumulator
= _accumulators
[i
];
459 UINT32 maxValue
= accumulator
.score() + accumulator
.unseen( _fullUnseen
);
461 if ( maxValue
>= _threshold
) {
462 _newAccumulators
.push_back( _accumulators
[i
] );
475 return _accumulators
.size();
485 for( int i
=0; i
< _maxUnseen
.size(); i
++ ) {
486 totalUnseen
+= _maxUnseen
[i
];
493 // canIgnoreFuturePostings
495 // Checks to see if additional postings could possibly change the
496 // top ranks; returns true if no additional postings are needed.
499 bool canIgnoreFuturePostings( int requested
) {
500 #ifdef GALAGO_NO_IGNORE
504 // if we're not in AND mode yet, any new posting could change
505 // the top of the ranked list
506 if( useAndMode() == false )
509 // if we haven't trimmed the accumulators down very much,
510 // it's probably not time to try this yet.
511 if( _accumulators
.size() > 3*requested
)
514 // if we already determined that we can, there's no use
516 if( _canIgnorePostings
)
519 // decode accumulators
520 std::vector
<_AccumulatorType
> partials
;
521 _AccumulatorType::computeUnseenScoreArray( _fullUnseen
, _maxUnseen
);
523 for( int i
=0; i
< _accumulators
.size(); i
++ ) {
524 const _AccumulatorType
& accumulator
= _accumulators
[i
];
525 UINT32 maxValue
= accumulator
.score() + accumulator
.unseen( _fullUnseen
);
527 if ( maxValue
< _threshold
)
530 partials
.push_back( accumulator
);
533 if( partials
.size() == 0 )
536 std::sort( partials
.begin(), partials
.end() );
538 for( int i
=0; i
<partials
.size()-1; i
++ ) {
539 const _AccumulatorType
& one
= partials
[i
];
540 const _AccumulatorType
& two
= partials
[i
+1];
542 UINT32 oneLimit
= one
.score() + one
.unseen( _fullUnseen
);
544 // we already know that one.score < two.score, so
545 // if oneLimit > two.score, one might surpass two
546 // with additional postings
547 if ( oneLimit
> two
.score() )
550 // if the two scores are different, but additional
551 // information might make them the same, we need to
552 // get additional information
553 if ( one
.score() < two
.score() && oneLimit
>= two
.score() )
557 _canIgnorePostings
= true;
565 std::vector
<indri::api::ScoredExtentResult
> topResults( int requested
) {
566 // find the top accumulators in the list
567 std::priority_queue
<indri::api::ScoredExtentResult
, std::vector
<indri::api::ScoredExtentResult
>, indri::api::ScoredExtentResult::score_greater
> scores
;
570 for( i
= 0; i
< std::min
<size_t>( requested
, _accumulators
.size() ); i
++ ) {
571 int document
= _accumulators
[i
].document();
572 int score
= _accumulators
[i
].score();
574 scores
.push( indri::api::ScoredExtentResult( score
, document
, 0, 0 ) );
577 UINT32 lowScore
= (UINT32
) scores
.top().score
;
579 for( ; i
< _accumulators
.size(); i
++ ) {
580 int document
= _accumulators
[i
].document();
581 int score
= _accumulators
[i
].score();
583 if( lowScore
< score
) {
584 scores
.push( indri::api::ScoredExtentResult( score
, document
, 0, 0 ) );
586 lowScore
= (UINT32
) scores
.top().score
;
590 std::vector
<indri::api::ScoredExtentResult
> results
;
591 results
.resize( scores
.size() );
593 for( i
=scores
.size()-1; i
>=0; i
-- ) {
594 results
[i
] = scores
.top();
605 int maxTerms() const {
606 return _AccumulatorType::maxTerms();
610 #endif // GALAGO_ARRAYACCUMULATORS_HPP