3 // BinOrderedBinnedRetrieval
5 // 5 January 2007 -- tds
8 #include "BinOrderedBinnedRetrieval.hpp"
13 #include "LongAccumulator.hpp"
14 #include "WideAccumulator.hpp"
15 #include "ArrayAccumulators.hpp"
23 void BinOrderedBinnedRetrieval::openRead( const std::string
& indexPath
) {
24 _index
.openRead( indexPath
);
31 void BinOrderedBinnedRetrieval::close() {
39 void BinOrderedBinnedRetrieval::_fetchIterators( std::vector
<ScoredIterator
*>& iterators
, const std::vector
<QueryTerm
>& queryTerms
) {
40 for( int i
=0; i
<queryTerms
.size(); i
++ ) {
41 const QueryTerm
& term
= queryTerms
[i
];
42 BinOrderedBinnedIterator
* iterator
= _index
.getIterator( term
.field
, term
.text
);
43 ScoredIterator
* scoredIterator
= new ScoredIterator( iterator
);
45 if( iterator
!= 0 && iterator
->next() ) {
46 iterators
.push_back(scoredIterator
);
53 // _processExcessTerms
56 void BinOrderedBinnedRetrieval::_processExcessTerms( std::vector
<ScoredIterator
*>& iterators
, Accumulators
& accumulators
) {
57 if( accumulators
.maxTerms() < iterators
.size() ) {
58 std::sort( iterators
.begin(), iterators
.end(), ScoredIterator::order() );
60 // process the terms that have the highest scores (and the shortest inverted lists)
61 for( int i
= accumulators
.maxTerms(); i
< iterators
.size(); i
++ ) {
62 ScoredIterator
* iter
= iterators
[i
];
65 int score
= iter
->iterator
->bin() * iter
->weight
;
66 accumulators
.processBin( score
, iter
->iterator
);
67 } while( iter
->iterator
->next() );
72 // remove all the terms we've already processed
73 iterators
.resize( accumulators
.maxTerms() );
81 void BinOrderedBinnedRetrieval::_loadQueue( siter_queue
& queue
, std::vector
<ScoredIterator
*>& iterators
, Accumulators
& accumulators
) {
82 // set up the bin weights, maxScore for each iterator that remains
83 for( int i
=0; i
<iterators
.size(); i
++ ) {
84 ScoredIterator
* iter
= iterators
[i
];
86 UINT32 maxScore
= iter
->iterator
->bin() * iter
->weight
;
87 iter
->maxScore
= maxScore
;
89 accumulators
.setMaxUnseen( i
, maxScore
);
98 std::vector
<indri::api::ScoredExtentResult
>
99 BinOrderedBinnedRetrieval::runQuery( const std::vector
<QueryTerm
>& inputTerms
,
102 std::vector
<ScoredIterator
*> iterators
;
104 int maximumAccumulators
= 0;
106 // stem/stop query terms
107 std::vector
<QueryTerm
> queryTerms
= _index
.processTerms( inputTerms
);
109 // fetch iterators for the query terms
110 _fetchIterators( iterators
, queryTerms
);
111 galago_log_query_terms( iterators
.size() );
113 // if we're using Anh/Moffat impact indexes, we need to calculate
114 // extra weights at query time.
115 if( _index
.requiresWeighting() )
116 _calculateQueryWeights( _index
, iterators
, _index
.bins() );
117 _assignMaxScore( iterators
);
119 // buid an accumulator table that's appropriate for the query length
120 Accumulators
* accumulators
= 0;
121 if( iterators
.size() <= LongAccumulator::maxTerms() ) {
122 accumulators
= new ArrayAccumulators
<LongAccumulator
>( iterators
.size() );
124 accumulators
= new ArrayAccumulators
<WideAccumulator
>( iterators
.size() );
127 // the standard accumulators table can only handle a few terms, so we have
128 // to process any excess terms first
129 _processExcessTerms( iterators
, *accumulators
);
130 _loadQueue( queue
, iterators
, *accumulators
);
132 if( threshold
>= 0 ) {
133 accumulators
->setThresholdScore( threshold
);
138 // for all remaining terms, we enter the optimized loop
139 while( queue
.size() > 0 ) {
140 ScoredIterator
* iter
= queue
.top();
143 // if we can quit now, do it
144 if( accumulators
->canIgnoreFuturePostings( requested
) ) {
147 while( queue
.size() > 0 ) {
155 int score
= iter
->iterator
->bin() * iter
->weight
;
156 accumulators
->processBin( score
, iter
->index
, iter
->iterator
);
157 bool hasMore
= iter
->iterator
->next();
160 _assignMaxScore( iter
);
161 accumulators
->setMaxUnseen( iter
->index
, iter
->maxScore
);
164 accumulators
->setMaxUnseen( iter
->index
, 0 );
168 accumulators
->calculateThresholdScore( requested
);
169 maximumAccumulators
= std::max( maximumAccumulators
, accumulators
->size() );
170 accumulators
->trim();
173 std::vector
<indri::api::ScoredExtentResult
> results
= accumulators
->topResults( requested
);
182 std::string
BinOrderedBinnedRetrieval::getDocument( int doc
) {
183 return _index
.getDocument(doc
);
187 // _calculateQueryWeights
190 void BinOrderedBinnedRetrieval::_calculateQueryWeights( const BinnedIndex
& index
, std::vector
<ScoredIterator
*>& scored
, int binCount
) {
191 // compute the query weights; totally ad hoc approximation here
193 // 1. determine the maximum count
194 UINT64 maxCount
= index
.maximumDocumentCount();
196 // 2. compute these log value things
197 indri::utility::greedy_vector
<double> logWeights
;
198 double maxWeight
= 0;
199 for( int i
=0; i
<scored
.size(); i
++ ) {
200 double logWeight
= log( 1 + (double)maxCount
/ (double)scored
[i
]->iterator
->documentCount() );
201 logWeights
.push_back( logWeight
);
202 maxWeight
= std::max( maxWeight
, logWeight
);
205 // 3. bin them to the bin levels
206 for( int i
=0; i
<scored
.size(); i
++ ) {
207 double logWeight
= logWeights
[i
];
208 int binned
= (int) (binCount
* logWeight
/ maxWeight
);
209 scored
[i
]->weight
*= std::max( 1, binned
);
217 void BinOrderedBinnedRetrieval::_assignMaxScore( std::vector
<ScoredIterator
*>& scored
) {
218 for( int i
=0; i
<scored
.size(); i
++ ) {
219 _assignMaxScore( scored
[i
] );
227 void BinOrderedBinnedRetrieval::_assignMaxScore( ScoredIterator
* scored
) {
228 scored
->maxScore
= scored
->weight
* scored
->iterator
->bin();