2 Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of
7 the License, or (at your option) any later version.
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this library; see the file COPYING. If not, write to
16 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 Boston, MA 02110-1301, USA.
20 #include "sopranoindexreader.h"
22 #include <strigi/query.h>
23 #include <strigi/queryparser.h>
24 #include <strigi/fieldtypes.h>
27 #include <Soprano/Soprano>
28 #include <Soprano/Index/IndexFilterModel>
29 #include <Soprano/Index/CLuceneIndex>
30 #include <Soprano/Vocabulary/XMLSchema>
38 #include <QtCore/QThread>
39 #include <QtCore/QDateTime>
40 #include <QtCore/QDebug>
41 #include <QtCore/QString>
42 #include <QtCore/QLatin1String>
43 #include <QtCore/QFile>
46 using namespace Soprano
;
49 static lucene::index::Term
* createWildCardTerm( const TString
& name
,
50 const string
& value
);
51 static lucene::index::Term
* createTerm( const TString
& name
,
52 const string
& value
);
53 static lucene::index::Term
* createKeywordTerm( const TString
& name
,
54 const string
& value
);
55 static lucene::search::BooleanQuery
* createBooleanQuery( const Strigi::Query
& query
);
56 static lucene::search::Query
* createQuery( const Strigi::Query
& query
);
57 static lucene::search::Query
* createSimpleQuery( const Strigi::Query
& query
);
58 static lucene::search::Query
* createSingleFieldQuery( const string
& field
,
59 const Strigi::Query
& query
);
60 static lucene::search::Query
* createMultiFieldQuery( const Strigi::Query
& query
);
// Escapes the characters that are special in the lucene query syntax (see
// the list below): every match of the character class is replaced by a
// backslash followed by the matched character.
// NOTE(review): the declaration of `es` and the return statement are not
// visible in this excerpt -- presumably `es` is a copy of `s` that is
// returned after the replacement; confirm against the full file.
63 static QString
luceneQueryEscape( const QString
& s
)
65 /* Chars to escape: + - && || ! ( ) { } [ ] ^ " ~ : \ */
// Character class built from the escaped special characters; the single
// capture group is referenced by the replacement text below.
67 static QRegExp
rx( "([\\-" + QRegExp::escape( "+&|!(){}[]^\"~:\\" ) + "])" );
// Replacement text: a literal backslash followed by capture group 1.
69 es
.replace( rx
, "\\\\1" );
74 static lucene::index::Term
* createWildCardTerm( const TString
& name
,
77 TString v
= TString::fromUtf8( value
.c_str() );
78 return _CLNEW
lucene::index::Term( name
.data(), v
.data() );
// Builds a lucene Term for the field `name` from the UTF-8 string `value`.
// The value is converted to a TString and passed through the token stream
// of a StandardAnalyzer for that field; the first token is fetched.
// NOTE(review): the declaration of `tv` and everything following the Term
// construction (cleanup, return) are not visible in this excerpt --
// presumably `tv` is the text of the analyzed token; confirm against the
// full file.
81 static lucene::index::Term
* createTerm( const TString
& name
,
84 qDebug() << "createTerm" << name
<< value
.c_str();
86 TString v
= TString::fromUtf8( value
.c_str() );
// Run the value through the analyzer for this field.
88 lucene::util::StringReader
sr( v
.data() );
89 lucene::analysis::standard::StandardAnalyzer a
;
90 lucene::analysis::TokenStream
* ts
= a
.tokenStream(name
.data(), &sr
);
// Only the first token of the stream is read here.
91 lucene::analysis::Token
* to
= ts
->next();
98 lucene::index::Term
* t
= _CLNEW
lucene::index::Term(name
.data(), tv
);
106 static lucene::index::Term
* createKeywordTerm( const TString
& name
,
107 const string
& value
)
109 TString v
= TString::fromUtf8( value
.c_str() );
110 lucene::index::Term
* t
= _CLNEW
lucene::index::Term( name
.data(), v
.data() );
114 static lucene::search::BooleanQuery
* createBooleanQuery( const Strigi::Query
& query
)
116 lucene::search::BooleanQuery
* bq
= _CLNEW
lucene::search::BooleanQuery();
117 bool isAnd
= query
.type() == Strigi::Query::And
;
118 const vector
<Strigi::Query
>& sub
= query
.subQueries();
119 for (vector
<Strigi::Query
>::const_iterator i
= sub
.begin(); i
!= sub
.end(); ++i
) {
120 lucene::search::Query
* q
= createQuery(*i
);
121 bq
->add(q
, true, isAnd
, i
->negate());
126 static lucene::search::Query
* createQuery( const Strigi::Query
& query
)
128 return query
.subQueries().size()
129 ? createBooleanQuery(query
)
130 : createSimpleQuery(query
);
133 static lucene::search::Query
* createSimpleQuery( const Strigi::Query
& query
)
135 switch (query
.fields().size()) {
136 case 0: return createSingleFieldQuery("text", query
);
137 case 1: return createSingleFieldQuery(query
.fields()[0], query
);
138 default: return createMultiFieldQuery(query
);
// Builds the lucene query for one field of a Strigi query.
// The field name is mapped with Util::convertSearchField(); the query type
// then selects the lucene query class: RangeQuery for the four comparison
// types, TermQuery for Keyword, and for the remaining types a WildcardQuery
// when the term contains '*' or '?' or a TermQuery on the analyzed term.
// NOTE(review): the `break` statements, the `default:` label, the `else`
// between the wildcard and the plain term branch, and the end of the
// function (return, any release of `t`) are not visible in this excerpt --
// confirm against the full file.
142 static lucene::search::Query
* createSingleFieldQuery( const string
& field
,
143 const Strigi::Query
& query
) {
144 qDebug() << "Creating single field query: " << field
.c_str();
145 TString fieldname
= Strigi::Soprano::Util::convertSearchField( field
);
146 lucene::search::Query
* q
;
147 lucene::index::Term
* t
;
148 const string
& val
= query
.term().string();
149 switch (query
.type()) {
// Range queries: a 0 in the first or second argument leaves that side of
// the range open; the last argument differs between the strict and the
// "or equals" variants (presumably the inclusive flag -- confirm).
150 case Strigi::Query::LessThan
:
151 t
= createTerm(fieldname
, val
.c_str());
152 q
= _CLNEW
lucene::search::RangeQuery(0, t
, false);
154 case Strigi::Query::LessThanEquals
:
155 t
= createTerm(fieldname
, query
.term().string());
156 q
= _CLNEW
lucene::search::RangeQuery(0, t
, true);
158 case Strigi::Query::GreaterThan
:
159 t
= createTerm(fieldname
, query
.term().string());
160 q
= _CLNEW
lucene::search::RangeQuery(t
, 0, false);
162 case Strigi::Query::GreaterThanEquals
:
163 t
= createTerm(fieldname
, query
.term().string());
164 q
= _CLNEW
lucene::search::RangeQuery(t
, 0, true);
// Keyword queries use the value verbatim (no analyzer).
166 case Strigi::Query::Keyword
:
167 t
= createKeywordTerm(fieldname
, query
.term().string());
168 q
= _CLNEW
lucene::search::TermQuery(t
);
// A term containing '*' or '?' is treated as a wildcard pattern.
171 if (strpbrk(val
.c_str(), "*?")) {
172 t
= createWildCardTerm(fieldname
, val
);
173 q
= _CLNEW
lucene::search::WildcardQuery(t
);
// Plain term query on the analyzed value.
175 t
= createTerm(fieldname
, val
);
176 q
= _CLNEW
lucene::search::TermQuery(t
);
183 static lucene::search::Query
* createMultiFieldQuery( const Strigi::Query
& query
)
185 lucene::search::BooleanQuery
* bq
= _CLNEW
lucene::search::BooleanQuery();
186 for (vector
<string
>::const_iterator i
= query
.fields().begin();
187 i
!= query
.fields().end(); ++i
) {
188 lucene::search::Query
* q
= createSingleFieldQuery(*i
, query
);
189 bq
->add(q
, true, false, false);
195 static QString
escapeLiteralForSparqlQuery( const QString
& s
)
197 return QString( s
).replace( '\\', "\\\\" ).replace( '\"', "\\\"" );
// Private data of the IndexReader: the Soprano model that is queried and a
// helper that fills an IndexedDocument from the statements of one resource.
// NOTE(review): several lines of this class (access specifier, the error
// branch after lastError(), the declaration of `s`, the assignment in the
// path branch, the else keywords and the return) are not visible in this
// excerpt -- confirm against the full file.
201 class Strigi::Soprano::IndexReader::Private
// Fills `doc` from all statements that have `res` as subject.
204 bool createDocument( const Node
& res
, IndexedDocument
& doc
) {
205 StatementIterator it
= repository
->listStatements( Statement( res
, Node(), Node() ) );
206 if ( it
.lastError() ) {
210 // use the resource URI as fallback file URI
211 doc
.uri
= res
.uri().toLocalFile().toUtf8().data();
213 while ( it
.next() ) {
// Only literal objects are evaluated; the predicate URI is mapped to a
// field name and the literal is taken as a UTF-8 string.
215 if ( s
.object().isLiteral() ) {
216 std::string fieldName
= Util::fieldName( s
.predicate().uri() );
217 std::string value
= s
.object().toString().toUtf8().data();
// Well-known fields are stored in dedicated IndexedDocument members.
219 if (fieldName
== "text") {
220 doc
.fragment
= value
;
222 else if (fieldName
== FieldRegister::pathFieldName
) {
223 qDebug() << "Setting IndexedDocument uri=" << value
.c_str();
226 else if (fieldName
== FieldRegister::mimetypeFieldName
) {
227 doc
.mimetype
= value
;
229 else if (fieldName
== FieldRegister::mtimeFieldName
) {
230 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
231 if ( s
.object().literal().isDateTime() ) {
232 doc
.mtime
= s
.object().literal().toDateTime().toTime_t();
235 doc
.mtime
= s
.object().literal().toUnsignedInt();
238 else if (fieldName
== FieldRegister::sizeFieldName
) {
239 doc
.size
= s
.object().literal().toInt64();
// Field/value pair inserted into the generic property map (presumably the
// branch for all remaining fields -- confirm).
242 doc
.properties
.insert( make_pair
<const string
, string
>( fieldName
, value
) );
246 // FIXME: For "Strigi++" we should at least go one level deeper, i.e. make an RDF query on those results that are
247 // not literal statements
254 // ::Soprano::Index::IndexFilterModel* repository;
// The model all queries of the reader are run against; assigned in the
// IndexReader constructor.
255 ::Soprano::Model
* repository
;
// Creates a reader that runs its queries against `model`: the pointer is
// stored in the private data and the constructing thread is logged.
// NOTE(review): the creation of `d` is not visible in this excerpt, and
// nothing here shows who owns `model` -- confirm against the full file.
259 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model
* model
)
260 : Strigi::IndexReader()
262 qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
264 d
->repository
= model
;
// Destructor; logs the thread it runs in.
// NOTE(review): any cleanup of the private data is outside this excerpt.
268 Strigi::Soprano::IndexReader::~IndexReader()
270 qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
// Runs `query` against the model and iterates over all hits.
// The Strigi query is converted to a lucene query whose string form is
// executed as a user query language named "lucene".
// NOTE(review): the hit counter, the return statement and any release of
// `q` are not visible in this excerpt -- confirm against the full file.
275 int32_t Strigi::Soprano::IndexReader::countHits( const Query
& query
)
277 qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
279 lucene::search::Query
* q
= createQuery( query
);
280 ::Soprano::QueryResultIterator hits
= d
->repository
->executeQuery( TString( q
->toString(), true ),
281 ::Soprano::Query::QueryLanguageUser
,
282 QLatin1String( "lucene" ) );
283 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( q );
// Walk over every hit; each one is logged.
285 while ( hits
.next() ) {
286 qDebug() << "Query hit:" << hits
.binding( 0 );
// Runs `query` and appends one row per hit to `result`. For each requested
// field the statements of the hit's "resource" binding with the field's
// predicate are listed; a row gets either the object of the current
// statement converted to a Variant or an empty Variant.
// `types` must provide an entry for every element of `fields`; running out
// of types is treated as a fatal error.
// NOTE(review): the remaining parameters, the handling of offset/limit, the
// iterator advances and the condition choosing between the two push_back
// calls are not visible in this excerpt -- confirm against the full file.
294 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query
& query
,
295 const std::vector
<std::string
>& fields
,
296 const std::vector
<Strigi::Variant::Type
>& types
,
297 std::vector
<std::vector
<Strigi::Variant
> >& result
,
300 qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
// Convert to a lucene query and execute its string form through the model.
301 lucene::search::Query
* bq
= createQuery( query
);
302 ::Soprano::QueryResultIterator hits
= d
->repository
->executeQuery( TString( bq
->toString(), true ),
303 ::Soprano::Query::QueryLanguageUser
,
304 QLatin1String( "lucene" ) );
305 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
308 while ( hits
.next() ) {
317 // ::Soprano::Index::QueryHit hit = *hits;
// One result row per hit, built field by field.
318 std::vector
<Strigi::Variant
> resultRow
;
319 std::vector
<std::string
>::const_iterator fieldIt
= fields
.begin();
320 std::vector
<Strigi::Variant::Type
>::const_iterator typesIt
= types
.begin();
321 while ( fieldIt
!= fields
.end() ) {
322 if ( typesIt
== types
.end() ) {
323 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
// Look up the value of the current field for the hit resource.
327 StatementIterator it
= d
->repository
->listStatements( Statement( hits
.binding( "resource" ),
328 Util::fieldUri( *fieldIt
),
330 // FIXME: what if we have a field with a cardinality > 1?
332 resultRow
.push_back( Util::nodeToVariant( it
.current().object() ) );
335 resultRow
.push_back( Strigi::Variant() );
342 result
.push_back( resultRow
);
// Runs `query` and returns one IndexedDocument per hit. The score is read
// from binding 1 of the hit and the document is filled from the resource in
// binding 0 via Private::createDocument(); hits for which that fails are
// logged together with the model's last error.
// NOTE(review): the use of `off` and `max`, the else between the success
// and the failure branch, and the return statement are not visible in this
// excerpt -- confirm against the full file.
348 std::vector
<Strigi::IndexedDocument
> Strigi::Soprano::IndexReader::query( const Query
& query
, int off
, int max
)
350 qDebug() << "IndexReader::query in thread" << QThread::currentThread();
351 vector
<IndexedDocument
> results
;
// Convert to a lucene query and execute its string form through the model.
352 lucene::search::Query
* bq
= createQuery( query
);
353 ::Soprano::QueryResultIterator hits
= d
->repository
->executeQuery( TString( bq
->toString(), true ),
354 ::Soprano::Query::QueryLanguageUser
,
355 QLatin1String( "lucene" ) );
356 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
359 while ( hits
.next() ) {
368 IndexedDocument result
;
369 // ::Soprano::Index::QueryHit hit = *hits;
370 result
.score
= hits
.binding( 1 ).literal().toDouble();
371 if ( d
->createDocument( hits
.binding( 0 ), result
) ) {
372 results
.push_back( result
);
375 qDebug() << "Failed to create indexed document for resource " << hits
.binding( 0 ) << ": " << d
->repository
->lastError();
// Fills `children` with path -> modification time for the entries whose
// parent location is `parent`. The SPARQL query matches the parent either
// as an xsd:string typed literal (%2/%3) or as a file URL node (%6).
// NOTE(review): part of the query string, the declaration of `path` and the
// else keywords are not visible in this excerpt -- confirm against the full
// file.
383 // an empty parent url is perfectly valid as strigi stores a parent url for everything
384 void Strigi::Soprano::IndexReader::getChildren( const std::string
& parent
,
385 std::map
<std::string
, time_t>& children
)
387 // qDebug() << "IndexReader::getChildren in thread" << QThread::currentThread();
388 QString query
= QString( "select distinct ?path ?mtime where { "
389 "{ { ?r <%1> \"%2\"^^<%3> . } UNION { ?r <%1> %6 . } } . "
// Placeholders in order: %1 parent location field, %2 escaped parent,
// %3 xsd:string, %4 mtime field, %5 path field, %6 parent as file URL in N3.
393 .arg( Util::fieldUri( FieldRegister::parentLocationFieldName
).toString(),
394 escapeLiteralForSparqlQuery( QString::fromUtf8( parent
.c_str() ) ),
395 Vocabulary::XMLSchema::string().toString(),
396 Util::fieldUri( FieldRegister::mtimeFieldName
).toString(),
397 Util::fieldUri( FieldRegister::pathFieldName
).toString(),
398 Node( QUrl::fromLocalFile( QFile::decodeName( parent
.c_str() ) ) ).toN3() );
400 // qDebug() << "running getChildren query:" << query;
402 QueryResultIterator result
= d
->repository
->executeQuery( query
, ::Soprano::Query::QueryLanguageSparql
);
404 while ( result
.next() ) {
405 Node pathNode
= result
.binding( "path" );
406 Node mTimeNode
= result
.binding( "mtime" );
407 // qDebug() << "file in index: " << pathNode.toString() << "mtime:" << mTimeNode.literal().toDateTime() << "(" << mTimeNode.literal().toDateTime().toTime_t() << ")";
409 // be backwards compatible in case there are paths left encoded as literals
// A literal path is taken as UTF-8 text; a URI path is converted to a local
// file name in the file system encoding.
411 if ( pathNode
.isLiteral() )
412 path
= pathNode
.toString().toUtf8().data();
414 path
= QFile::encodeName( pathNode
.uri().toLocalFile() ).data();
416 // Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
417 if ( mTimeNode
.literal().isDateTime() ) {
418 children
[path
] = mTimeNode
.literal().toDateTime().toTime_t();
421 children
[path
] = mTimeNode
.literal().toUnsignedInt();
// Logs the calling thread.
// NOTE(review): the return statement is not visible in this excerpt; the
// FIXME below suggests that real counting is not implemented -- confirm.
427 int32_t Strigi::Soprano::IndexReader::countDocuments()
429 qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
430 // FIXME: the only solution I see ATM is: select distinct ?r where { ?r ?p ?o }
// Logs the calling thread.
// NOTE(review): the return statement is not visible in this excerpt; the
// FIXME below suggests that word counting is not implemented -- confirm.
435 int32_t Strigi::Soprano::IndexReader::countWords()
437 qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
438 // FIXME: what to do here? use the index? Count the predicates?
443 int64_t Strigi::Soprano::IndexReader::indexSize()
445 qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
446 return d
->repository
->statementCount();
// Looks up the modification time stored for the entry whose path field
// equals `uri` (matched as an xsd:string typed literal).
// NOTE(review): the declaration of `mtime`, the check that the query
// produced a result, the else keyword and the return statement are not
// visible in this excerpt -- confirm against the full file.
450 time_t Strigi::Soprano::IndexReader::mTime( const std::string
& uri
)
452 // qDebug() << "IndexReader::mTime in thread" << QThread::currentThread();
// Placeholders in order: %1 mtime field, %2 path field, %3 escaped uri,
// %4 xsd:string.
453 QString query
= QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
454 .arg( Util::fieldUri( FieldRegister::mtimeFieldName
).toString(),
455 Util::fieldUri( FieldRegister::pathFieldName
).toString(),
456 escapeLiteralForSparqlQuery( QString::fromUtf8( uri
.c_str() ) ),
457 Vocabulary::XMLSchema::string().toString() );
459 qDebug() << "mTime( " << uri
.c_str() << ") query:" << query
;
461 QueryResultIterator it
= d
->repository
->executeQuery( query
, ::Soprano::Query::QueryLanguageSparql
);
465 ::Soprano::LiteralValue val
= it
.binding( "mtime" ).literal();
467 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
468 if ( val
.isDateTime() ) {
469 mtime
= val
.toDateTime().toTime_t();
472 mtime
= val
.toUnsignedInt();
479 std::vector
<std::string
> Strigi::Soprano::IndexReader::fieldNames()
481 qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
482 // This is a weird method
483 // Our list of field names (the predicates) is probably awefully long.
485 std::vector
<std::string
> fields
;
486 QueryResultIterator it
= d
->repository
->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QueryLanguageSparql
);
487 while ( it
.next() ) {
488 fields
.push_back( Util::fieldName( it
.binding("p").uri() ) );
// Not implemented: logs the calling thread and returns an empty histogram.
// NOTE(review): a few lines right after the signature are not visible in
// this excerpt -- confirm against the full file.
494 std::vector
<std::pair
<std::string
,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string
& query
,
495 const std::string
& fieldname
,
496 const std::string
& labeltype
)
502 // FIXME: what is meant by fieldname and labeltype?
503 qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
504 // IMPLEMENTME? Seems not like a very important method though.
505 return std::vector
<std::pair
<std::string
,uint32_t> >();
// Stub: both parameters are unused and only the calling thread is logged.
// NOTE(review): the return statement is not visible in this excerpt; going
// by the comment below it presumably returns the constant 2 -- confirm.
509 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string
& keywordprefix
,
510 const std::vector
<std::string
>& fieldnames
)
512 Q_UNUSED(keywordprefix
);
513 Q_UNUSED(fieldnames
);
515 qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
516 // the clucene indexer also returns 2. I suspect this means: "not implemented" ;)
// Not implemented: logs the calling thread and returns an empty list.
// NOTE(review): some lines between the visible statements are missing from
// this excerpt (presumably the handling of `max` and `offset`) -- confirm
// against the full file.
521 std::vector
<std::string
> Strigi::Soprano::IndexReader::keywords( const std::string
& keywordmatch
,
522 const std::vector
<std::string
>& fieldnames
,
523 uint32_t max
, uint32_t offset
)
525 Q_UNUSED(keywordmatch
);
526 Q_UNUSED(fieldnames
);
530 qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
531 // IMPLEMENTME? Seems like a rarely used method...
532 return std::vector
<std::string
>();