cpp/galago/include/WideAccumulator.hpp

   1
   2 //
   3 // WideAccumulator
   4 //
   5 // 23 February 2007 -- tds
   6 //
   7
   8 #ifndef GALAGO_WIDEACCUMULATOR_HPP
   9 #define GALAGO_WIDEACCUMULATOR_HPP
  10
  11 #define WA_TERM_BITS        (64)
  12 #define WA_TERM_MAX_SCORE   (MAX_INT32)
  13
  14 class WideAccumulator {
  15 private:
  16     UINT32 _document;
  17     UINT32 _score;
  18     UINT64 _terms;
  19
  20 public:
  21     typedef WideAccumulator update_type;
  22     typedef UINT64 term_type;
  23
  24     WideAccumulator() :
  25         _document(0), _score(0), _terms(0)
  26     {
  27     }
  28
  29     WideAccumulator( const UINT32 document, const UINT32 terms, const UINT64 score ) {
  30         _document = document;
  31         _terms = terms;
  32         _score = score;
  33     }
  34
  35     WideAccumulator( const UINT32 document, const update_type update ) {
  36         _document = document;
  37         _terms = update.terms();
  38         _score = update.score();
  39     }
  40
  41     bool operator< ( const WideAccumulator& other ) const {
  42         return score() < other.score();
  43     }
  44
  45     inline UINT32 document() const {
  46         return _document;
  47     }
  48
  49     inline UINT32 score() const {
  50         return _score;
  51     }
  52
  53     inline UINT32 terms() const {
  54         return _terms;
  55     }
  56
  57     inline void update( const update_type update ) {
  58         _terms |= update.terms();
  59         _score += update.score();
  60     }
  61
  62     inline UINT32 unseen( const std::vector<UINT32>& fullUnseen ) const {
  63         UINT32 result = 0;
  64
  65         for( int b=0; b<8; b++ ) {
  66             UINT32 bits = (_terms >> (b*8)) & 0xFF;
  67             result += fullUnseen[ (b<<8) | bits ];
  68         }
  69
  70         return result;
  71     }
  72
  73     inline bool containsTerm( const term_type termBit ) const {
  74         return (termBit & _terms) ? true : false;
  75     }
  76
  77     static int maxScore() {
  78         return WA_TERM_MAX_SCORE;
  79     }
  80
  81     static int maxTerms() {
  82         return WA_TERM_BITS;
  83     }
  84
  85     static term_type buildTerm( const int termIndex ) {
  86         return (1ULL<<termIndex);
  87     }
  88
  89     static term_type buildNullTerm() {
  90         return 0;
  91     }
  92
  93     static update_type buildUpdate( const int termIndex, const UINT32 score ) {
  94         WideAccumulator result;
  95         result._terms = buildTerm( termIndex );
  96         result._score = score;
  97         return result;
  98     }
  99
 100     static void computeUnseenScoreArray( std::vector<UINT32>& fullUnseen, const std::vector<UINT32>& maxUnseen ) {
 101         // fullUnseen is a precomputed array that helps make our job easier
 102         // when computing unseen statistics.
 103         // For WideAccumulators, we use 2048 (256*8) entries.
 104         // The low 8-bits of the index into this array is a bitmap.  The upper 3 bits
 105         // are a byte index.  This code probably makes more sense if you look at the unseen method too.
 106
 107         fullUnseen.resize( 2048, 0 );
 108
 109         // b selects one byte from the terms bitmap
 110         for( UINT32 b = 0; b < 8; b++ ) {
 111             // bits iterates over all possible settings of 8 bits
 112             for( UINT32 bits = 0; bits < 256; bits++ ) {
 113                 fullUnseen[ (b<<8) | bits ] = 0;
 114
 115                 for( UINT32 bitIndex = 0; bitIndex < 8; bitIndex++ ) {
 116                     UINT32 bit = 1<<bitIndex;
 117                     UINT32 term = bitIndex + b*8;
 118
 119                     if( (bits & bit) == 0 && term < maxUnseen.size() ) {
 120                         fullUnseen[ (b<<8) | bits ] += maxUnseen[ term ];
 121                     }
 122                 }
 123             }
 124         }
 125     }
 126
 127 };
 128
 129 template<>
 130 class AccumulatorTypes<WideAccumulator> {
 131 public:
 132     typedef WideAccumulator update_type;
 133     typedef UINT64 term_type;
 134 };
 135
#endif // GALAGO_WIDEACCUMULATOR_HPP
 137
 138
 139