cpp/galago/include/ArrayAccumulators.hpp

   1 //
   2 // ArrayAccumulators
   3 //
   4 // 5 January 2007 -- tds
   5 //
   6
   7 #include "LongArrayAccumulators.hpp"
   8 #include "indri/indri-platform.h"
   9 #include "lemur/RVLCompress.hpp"
  10 #include "Logging.hpp"
  11 #include "Accumulators.hpp"
  12 #include "Bits.hpp"
  13 #include <vector>
  14 #include <string>
  15 #include <queue>
  16
  17 #ifndef GALAGO_ARRAYACCUMULATORS_HPP
  18 #define GALAGO_ARRAYACCUMULATORS_HPP
  19
  20 template<class _AccumulatorType>
  21 class ArrayAccumulators : public Accumulators {
  22 public:
  23     typedef typename AccumulatorTypes<_AccumulatorType>::term_type   term_type;
  24     typedef typename AccumulatorTypes<_AccumulatorType>::update_type update_type;
  25
  26 private:
  27     std::vector<_AccumulatorType> _first;
  28     std::vector<_AccumulatorType> _second;
  29     std::vector<_AccumulatorType>& _accumulators;
  30     std::vector<_AccumulatorType>& _newAccumulators;
  31
  32     std::vector<UINT32> _maxUnseen;
  33     std::vector<UINT32> _fullUnseen;
  34     std::vector<UINT32> _scoreCounts;
  35
  36     int _termCount;
  37     int _threshold;
  38     bool _needsTrim;
  39     bool _canIgnorePostings;
  40
  41     static inline void _moveNextOrAccumulator( const std::vector<_AccumulatorType>& accumulators,
  42                                                UINT32& nextAccumulator,
  43                                                int& index ) {
  44         index++;
  45
  46         if( index == accumulators.size() ) {
  47             nextAccumulator = MAX_UINT32;
  48         } else {
  49             nextAccumulator = accumulators[index].document();
  50         }
  51     }
  52
  53     static inline void _moveNextAndAccumulator( const std::vector<_AccumulatorType>& accumulators,
  54                                                 term_type termBit,
  55                                                 UINT32& nextAccumulator,
  56                                                 int& index ) {
  57         while (true) {
  58             index++;
  59
  60             if( index == accumulators.size() ) {
  61                 nextAccumulator = MAX_UINT32;
  62                 break;
  63             } else {
  64                 const _AccumulatorType& accumulator = accumulators[index];
  65
  66                 if( accumulator.containsTerm( termBit ) ) {
  67                     continue;
  68                 } else {
  69                     nextAccumulator = accumulator.document();
  70                     break;
  71                 }
  72             }
  73         }
  74     }
  75
  76     //
  77     // move_to_document
  78     //
  79     // Move forward in the inverted list to the next general location where we may be able to
  80     // update an accumulator.  We do this efficiently by incorporating skip data in the inverted
  81     // list.
  82     //
  83     // Skip data comes in compressed and delta-encoded (document, distance) pairs.  A
  84     // (document, distance) tuple implies that the posting for document is immediately before
  85     // binStart + distance.  So, if we are looking for nextDocument, and nextDocument > document,
  86     // we can safely skip to binStart + distance.
  87     //
  88     // Our strategy is to decode the skip information slightly ahead of nextDocument, so generally
  89     // skipDocument is set to something greater than nextDocument upon return.  This helps reduce
  90     // the overhead of skip processing in the event that there are lots of accumulators.
  91     // Notice that the function begins with a quick check to see if the skip data will be useful,
  92     // and it returns if it won't be useful.
  93     //
  94
  95     static inline const char* _moveToDocument( const UINT32 nextDocument, UINT32& currentDocument,
  96                                                const char* binStart, const char* binData,
  97                                                UINT32& skipDocument, size_t& skipDistance,
  98                                                const char*& skipData, const char* skipEnd ) {
  99         // if the next skip location isn't available, or if it is further
 100         // than we need to go, quit now.
 101         if( nextDocument <= skipDocument )
 102             return binData;
 103
 104     #ifndef GALAGO_NO_SKIPS
 105         if( skipData == skipEnd )
 106     #endif
 107         {
 108             skipDocument = MAX_UINT32;
 109             return binData;
 110         }
 111
 112         // we know there is more skip information, so we'll decode it
 113         while ( skipData < skipEnd ) {
 114             int docDelta;
 115             int distDelta;
 116
 117             // decode a (document, distance) pair
 118             skipData = lemur::utility::RVLCompress::decompress_int( skipData, docDelta );
 119             skipData = lemur::utility::RVLCompress::decompress_int( skipData, distDelta );
 120             skipDocument += docDelta;
 121             skipDistance += distDelta;
 122
 123             // if this latest skip information is more than we need, quit,
 124             // while storing the next skip point in (skipDocument, skipDistance).
 125             if( nextDocument <= skipDocument ) {
 126                 const char* lastSkip = binStart + skipDistance - distDelta;
 127
 128                 if( lastSkip < binData ) {
 129                     return binData;
 130                 } else {
 131                     currentDocument = skipDocument - docDelta;
 132                     return lastSkip;
 133                 }
 134             }
 135         }
 136
 137         // we're all done with skip data, so go to the point indicated by the last skip
 138         if( binData > binStart + skipDistance ) {
 139             skipDocument = MAX_UINT32;
 140             return binData;
 141         }
 142
 143         currentDocument = skipDocument;
 144         skipDocument = MAX_UINT32;
 145         return binStart + skipDistance;
 146     }
 147
 148     //
 149     // _newScore
 150     //
 151
 152     void _newScore( UINT32 score ) {
 153         if( _scoreCounts.size() < score+1 ) {
 154             _scoreCounts.resize( score+1, 0 );
 155         }
 156
 157         _scoreCounts[score]++;
 158     }
 159
 160     //
 161     // _updateScore
 162     //
 163
 164     void _updateScore( UINT32 oldScore, UINT32 newScore ) {
 165         if( _scoreCounts.size() < newScore+1 ) {
 166             _scoreCounts.resize( newScore+1, 0 );
 167         }
 168
 169         _scoreCounts[oldScore]--;
 170         _scoreCounts[newScore]++;
 171     }
 172
 173     //
 174     // _swapAccumulators
 175     //
 176
 177     void _swapAccumulators() {
 178         if( &_accumulators == &_first ) {
 179             _accumulators = _second;
 180             _newAccumulators = _first;
 181         } else {
 182             _accumulators = _first;
 183             _newAccumulators = _second;
 184         }
 185
 186         _newAccumulators.clear();
 187     }
 188
 189 public:
 190     ArrayAccumulators( int termCount ) :
 191         _accumulators(_first),
 192         _newAccumulators(_second)
 193     {
 194         int actual = std::min( termCount, _AccumulatorType::maxTerms() );
 195         _threshold = -1;
 196         _canIgnorePostings = false;
 197         _maxUnseen.resize( actual, _AccumulatorType::maxScore() );
 198         _termCount = actual;
 199         _needsTrim = false;
 200     }
 201
 202     //
 203     // setThresholdScore
 204     //
 205
 206     void setThresholdScore( int threshold ) {
 207         _threshold = std::max( threshold, _threshold );
 208     }
 209
 210     //
 211     // threshold
 212     //
 213
 214     int threshold() const {
 215         return _threshold;
 216     }
 217
 218     //
 219     // useAndMode
 220     //
 221
 222     bool useAndMode() {
 223         return totalUnseen() < _threshold;
 224     }
 225
 226     //
 227     // processBin
 228     //
 229
 230     void processBin( int score, BinOrderedBinnedIterator* iterator ) {
 231         processBinOr( score, -1, iterator );
 232     }
 233
 234     //
 235     // processBin
 236     //
 237
 238     void processBin( int score, int termIndex, BinOrderedBinnedIterator* iterator ) {
 239         if( useAndMode() ) {
 240             processBinAnd( score, termIndex, iterator );
 241         } else {
 242             processBinOr( score, termIndex, iterator );
 243         }
 244     }
 245
 246     //
 247     // processBinOr
 248     //
 249
 250     void processBinOr( int score, int termIndex, BinOrderedBinnedIterator* iterator ) {
 251         galago_logging log;
 252         galago_log_bin_start( log, GALAGO_MODE_OR, score, termIndex, iterator->postingsDataLength(), iterator->skipDataLength(), _accumulators.size(), _threshold );
 253
 254         assert( termIndex >= 0 );
 255         int accIndex=-1;
 256         update_type updateValue = _AccumulatorType::buildUpdate( termIndex, score );
 257         UINT32 nextAccumulator = 0;
 258
 259         _moveNextOrAccumulator( _accumulators, nextAccumulator, accIndex );
 260         _newAccumulators.reserve(_accumulators.size() + iterator->binDataLength() / 2 );
 261
 262         const char* binData = iterator->binData();
 263         const char* binEnd = iterator->binData() + iterator->binDataLength();
 264         int document = 0;
 265
 266         while( binData < binEnd ) {
 267             // decompress the next posting
 268             int delta = 0;
 269             binData = lemur::utility::RVLCompress::decompress_int( binData, delta );
 270             document += delta;
 271
 272             galago_log_posting( log );
 273
 274             // copy all accumulators that come before the current document
 275             while( nextAccumulator < document ) {
 276                 _newAccumulators.push_back( _accumulators[accIndex] );
 277                 _moveNextOrAccumulator( _accumulators, nextAccumulator, accIndex );
 278             }
 279
 280             // if this document already appears in the accumulators, update it
 281             if( nextAccumulator == document ) {
 282                 _AccumulatorType& accumulator = _accumulators[accIndex];
 283                 UINT32 oldScore = accumulator.score();
 284                 _updateScore( oldScore, oldScore + score );
 285
 286                 accumulator.update( updateValue );
 287                 _newAccumulators.push_back( accumulator );
 288                 _moveNextOrAccumulator( _accumulators, nextAccumulator, accIndex );
 289
 290                 galago_log_accumulator( log, document, oldScore + score );
 291             } else {
 292                 // otherwise, build a new accumulator for the new document
 293                 _AccumulatorType newAccumulator( document, updateValue );
 294                 _newAccumulators.push_back( newAccumulator );
 295                 _newScore( score );
 296
 297                 galago_log_accumulator( log, document, score );
 298             }
 299         }
 300
 301         // some old accumulators may be left; copy those too
 302         std::copy( _accumulators.begin() + accIndex, _accumulators.end(), std::back_inserter(_newAccumulators) );
 303         // copy the table back into place
 304         _swapAccumulators();
 305
 306         galago_log_bin_end( log, _accumulators.size() );
 307     }
 308
 309     //
 310     // processBinAnd
 311     //
 312
 313     void processBinAnd( int score, int termIndex, BinOrderedBinnedIterator* iterator ) {
 314         galago_logging log;
 315         galago_log_bin_start( log, GALAGO_MODE_AND, score, termIndex, iterator->postingsDataLength(), iterator->skipDataLength(), _accumulators.size(), _threshold );
 316
 317         int accIndex=0;
 318
 319         if( _accumulators.size() == 0 )
 320             return;
 321
 322         assert( termIndex <= _AccumulatorType::maxTerms() );
 323         term_type termBit = _AccumulatorType::buildTerm( termIndex );
 324         update_type updateValue = _AccumulatorType::buildUpdate( termIndex, score );
 325         UINT32 nextDocument = _accumulators[0].document();
 326         size_t skipDistance = 0;
 327
 328         const char* binDataStart = iterator->binData();
 329         const char* binData = iterator->binData();
 330         const char* binEnd = iterator->binData() + iterator->binDataLength();
 331
 332         const char* skipData = iterator->skipData();
 333         const char* skipEnd = skipData + iterator->skipDataLength();
 334         UINT32 skipDocument = 0;
 335
 336         // turn off skipping entirely if we have lots of accumulators
 337         if( iterator->skipDataLength()/4 <= _accumulators.size() ) {
 338             skipDocument = MAX_UINT32;
 339             skipData = skipEnd;
 340         }
 341
 342         UINT32 document = 0;
 343         binData = _moveToDocument( nextDocument, document, binDataStart, binData, skipDocument, skipDistance, skipData, skipEnd );
 344
 345         while( binData < binEnd ) {
 346             // decompress the next posting
 347             int delta = 0;
 348             binData = lemur::utility::RVLCompress::decompress_int( binData, delta );
 349             document += delta;
 350
 351             galago_log_posting( log );
 352
 353             // if this current document is smaller than the next accumulator, we go to the next one
 354             // just ignore it and move on (very common case)
 355             if( document < nextDocument )
 356                 continue;
 357
 358             // if the current document is after the next accumulator, we
 359             // need to move forward to see if current matches a later accumulator
 360             while( document > nextDocument )
 361                 _moveNextAndAccumulator( _accumulators, termBit, nextDocument, accIndex );
 362
 363             if( nextDocument == MAX_UINT32 )
 364                 break;
 365
 366             // update the accumulator, if this is a hit
 367             if( document == nextDocument ) {
 368                 _AccumulatorType& accumulator = _accumulators[accIndex];
 369                 UINT32 oldScore = accumulator.score();
 370
 371                 accumulator.update( updateValue );
 372                 _updateScore( oldScore, oldScore + score );
 373                 galago_log_accumulator( log, document, score );
 374
 375                 _moveNextAndAccumulator( _accumulators, termBit, nextDocument, accIndex );
 376             }
 377
 378             // try to move the data pointer forward to the next accumulator
 379             binData = _moveToDocument( nextDocument, document, binDataStart, binData, skipDocument, skipDistance, skipData, skipEnd );
 380         }
 381
 382         galago_log_bin_end( log, _accumulators.size() );
 383     }
 384
 385     //
 386     // setMaxUnseen
 387     //
 388
 389     void setMaxUnseen( int termIndex, int score ) {
 390         assert( termIndex < _termCount );
 391         assert( score <= _AccumulatorType::maxScore() );
 392
 393         _maxUnseen[termIndex] = score;
 394     }
 395
 396     //
 397     // calculateThresholdScore
 398     //
 399     // The threshold is the n^th largest score in the accumulator table (where n = requested).
 400     // Any score in the final ranked list will need to be above this computed threshold value.
 401     //
 402
 403     void calculateThresholdScore( int requested ) {
 404         int countsSeen = 0;
 405         int i;
 406
 407         for( i = _scoreCounts.size()-1; i >= 0; i-- ) {
 408             countsSeen += _scoreCounts[i];
 409
 410             if( countsSeen >= requested )
 411                 break;
 412         }
 413
 414     #ifndef GALAGO_NO_AND_MODE
 415         if( i > 0 )
 416             _needsTrim = true;
 417
 418         setThresholdScore( i );
 419     #else
 420         _threshold = -1;
 421     #endif
 422     }
 423
 424     //
 425     // trim
 426     //
 427
 428     void trim() {
 429         if( _threshold < 0 || _needsTrim == false )
 430             return;
 431
 432     #ifdef GALAGO_NO_TRIM
 433         return;
 434     #endif
 435
 436         _AccumulatorType::computeUnseenScoreArray( _fullUnseen, _maxUnseen );
 437
 438     #ifdef GALAGO_TRIM_ONCE
 439         for( int i=0; i<_accumulators.size(); i++ ) {
 440             const _AccumulatorType& accumulator = _accumulators[i];
 441             UINT32 score = accumulator.score();
 442
 443             if( score >= _threshold )
 444                 continue;
 445
 446             UINT32 seenTerms = accumulator.terms();
 447             UINT32 maxValue = score + accumulator.unseen( _fullUnseen );
 448
 449             // can't trim until no low scoring document would make it in the topk
 450             if ( maxValue >= _threshold )
 451                 return;
 452         }
 453     #endif
 454         _newAccumulators.reserve( _accumulators.size() );
 455
 456         // for each accumulator, we compute the max possible value and trim if it can't be achieved
 457         for( int i=0; i<_accumulators.size(); i++ ) {
 458             const _AccumulatorType& accumulator = _accumulators[i];
 459             UINT32 maxValue = accumulator.score() + accumulator.unseen( _fullUnseen );
 460
 461             if ( maxValue >= _threshold ) {
 462                 _newAccumulators.push_back( _accumulators[i] );
 463             }
 464         }
 465
 466         _swapAccumulators();
 467         _needsTrim = false;
 468     }
 469
 470     //
 471     // size
 472     //
 473
 474     int size() {
 475         return _accumulators.size();
 476     }
 477
 478     //
 479     // totalUnseen
 480     //
 481
 482     int totalUnseen() {
 483         int totalUnseen = 0;
 484
 485         for( int i=0; i < _maxUnseen.size(); i++ ) {
 486             totalUnseen += _maxUnseen[i];
 487         }
 488
 489         return totalUnseen;
 490     }
 491
 492     //
 493     // canIgnoreFuturePostings
 494     //
 495     // Checks to see if additional postings could possibly change the
 496     // top ranks; returns true if no additional postings are needed.
 497     //
 498
 499     bool canIgnoreFuturePostings( int requested ) {
 500     #ifdef GALAGO_NO_IGNORE
 501         return false;
 502     #endif
 503
 504         // if we're not in AND mode yet, any new posting could change
 505         // the top of the ranked list
 506         if( useAndMode() == false )
 507             return false;
 508
 509         // if we haven't trimmed the accumulators down very much,
 510         // it's probably not time to try this yet.
 511         if( _accumulators.size() > 3*requested )
 512             return false;
 513
 514         // if we already determined that we can, there's no use
 515         // checking again
 516         if( _canIgnorePostings )
 517             return true;
 518
 519         // decode accumulators
 520         std::vector<_AccumulatorType> partials;
 521         _AccumulatorType::computeUnseenScoreArray( _fullUnseen, _maxUnseen );
 522
 523         for( int i=0; i < _accumulators.size(); i++ ) {
 524             const _AccumulatorType& accumulator = _accumulators[i];
 525             UINT32 maxValue = accumulator.score() + accumulator.unseen( _fullUnseen );
 526
 527             if ( maxValue < _threshold )
 528                 continue;
 529
 530             partials.push_back( accumulator );
 531         }
 532
 533         if( partials.size() == 0 )
 534             return false;
 535
 536         std::sort( partials.begin(), partials.end() );
 537
 538         for( int i=0; i<partials.size()-1; i++ ) {
 539             const _AccumulatorType& one = partials[i];
 540             const _AccumulatorType& two = partials[i+1];
 541
 542             UINT32 oneLimit = one.score() + one.unseen( _fullUnseen );
 543
 544             // we already know that one.score < two.score, so
 545             // if oneLimit > two.score, one might surpass two
 546             // with additional postings
 547             if ( oneLimit > two.score() )
 548                 return false;
 549
 550             // if the two scores are different, but additional
 551             // information might make them the same, we need to
 552             // get additional information
 553             if ( one.score() < two.score() && oneLimit >= two.score() )
 554                 return false;
 555         }
 556
 557         _canIgnorePostings = true;
 558         return true;
 559     }
 560
 561     //
 562     // topResults
 563     //
 564
 565     std::vector<indri::api::ScoredExtentResult> topResults( int requested ) {
 566         // find the top accumulators in the list
 567         std::priority_queue<indri::api::ScoredExtentResult, std::vector<indri::api::ScoredExtentResult>, indri::api::ScoredExtentResult::score_greater> scores;
 568         int i;
 569
 570         for( i = 0; i < std::min<size_t>( requested, _accumulators.size() ); i++ ) {
 571             int document = _accumulators[i].document();
 572             int score = _accumulators[i].score();
 573
 574             scores.push( indri::api::ScoredExtentResult( score, document, 0, 0 ) );
 575         }
 576
 577         UINT32 lowScore = (UINT32) scores.top().score;
 578
 579         for( ; i < _accumulators.size(); i++ ) {
 580             int document = _accumulators[i].document();
 581             int score = _accumulators[i].score();
 582
 583             if( lowScore < score ) {
 584                 scores.push( indri::api::ScoredExtentResult( score, document, 0, 0 ) );
 585                 scores.pop();
 586                 lowScore = (UINT32) scores.top().score;
 587             }
 588         }
 589
 590         std::vector<indri::api::ScoredExtentResult> results;
 591         results.resize( scores.size() );
 592
 593         for( i=scores.size()-1; i>=0; i-- ) {
 594             results[i] = scores.top();
 595             scores.pop();
 596         }
 597
 598         return results;
 599     }
 600
 601     //
 602     // maxTerms
 603     //
 604
 605     int maxTerms() const {
 606         return _AccumulatorType::maxTerms();
 607     }
 608 };
 609
 610 #endif // GALAGO_ARRAYACCUMULATORS_HPP
 611