add more spacing
[personal-kdebase.git] / runtime / nepomuk / strigibackend / sopranoindexreader.cpp
blobe95cc0b6c345df99b195b6276428ccde8ec8af44
1 /*
2 Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of
7 the License, or (at your option) any later version.
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this library; see the file COPYING. If not, write to
16 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 Boston, MA 02110-1301, USA.
20 #include "sopranoindexreader.h"
21 #include "tstring.h"
22 #include <strigi/query.h>
23 #include <strigi/queryparser.h>
24 #include <strigi/fieldtypes.h>
25 #include "util.h"
27 #include <Soprano/Soprano>
28 #include <Soprano/Index/IndexFilterModel>
29 #include <Soprano/Index/CLuceneIndex>
30 #include <Soprano/Vocabulary/XMLSchema>
32 #include <map>
33 #include <utility>
34 #include <sstream>
36 #include <CLucene.h>
38 #include <QtCore/QThread>
39 #include <QtCore/QDateTime>
40 #include <QtCore/QDebug>
41 #include <QtCore/QString>
42 #include <QtCore/QLatin1String>
43 #include <QtCore/QFile>
46 using namespace Soprano;
49 static lucene::index::Term* createWildCardTerm( const TString& name,
50 const string& value );
51 static lucene::index::Term* createTerm( const TString& name,
52 const string& value );
53 static lucene::index::Term* createKeywordTerm( const TString& name,
54 const string& value );
55 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query );
56 static lucene::search::Query* createQuery( const Strigi::Query& query );
57 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query );
58 static lucene::search::Query* createSingleFieldQuery( const string& field,
59 const Strigi::Query& query );
60 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query );
#if 0
/*
 * Currently compiled out.
 *
 * Returns a copy of s in which the characters that are special in the
 * Lucene query syntax are escaped.
 */
static QString luceneQueryEscape( const QString& s )
{
    /* Chars to escape: + - && || ! ( ) { } [ ] ^ " ~ : \ */

    static QRegExp specialChars( "([\\-" + QRegExp::escape( "+&|!(){}[]^\"~:\\" ) + "])" );

    QString escaped( s );
    escaped.replace( specialChars, "\\\\1" );
    return escaped;
}
#endif
74 static lucene::index::Term* createWildCardTerm( const TString& name,
75 const string& value )
77 TString v = TString::fromUtf8( value.c_str() );
78 return _CLNEW lucene::index::Term( name.data(), v.data() );
81 static lucene::index::Term* createTerm( const TString& name,
82 const string& value )
84 qDebug() << "createTerm" << name << value.c_str();
86 TString v = TString::fromUtf8( value.c_str() );
88 lucene::util::StringReader sr( v.data() );
89 lucene::analysis::standard::StandardAnalyzer a;
90 lucene::analysis::TokenStream* ts = a.tokenStream(name.data(), &sr);
91 lucene::analysis::Token* to = ts->next();
92 const wchar_t *tv;
93 if (to) {
94 tv = to->termText();
95 } else {
96 tv = v.data();
98 lucene::index::Term* t = _CLNEW lucene::index::Term(name.data(), tv);
99 if (to) {
100 _CLDELETE(to);
102 _CLDELETE(ts);
103 return t;
106 static lucene::index::Term* createKeywordTerm( const TString& name,
107 const string& value )
109 TString v = TString::fromUtf8( value.c_str() );
110 lucene::index::Term* t = _CLNEW lucene::index::Term( name.data(), v.data() );
111 return t;
114 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query )
116 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
117 bool isAnd = query.type() == Strigi::Query::And;
118 const vector<Strigi::Query>& sub = query.subQueries();
119 for (vector<Strigi::Query>::const_iterator i = sub.begin(); i != sub.end(); ++i) {
120 lucene::search::Query* q = createQuery(*i);
121 bq->add(q, true, isAnd, i->negate());
123 return bq;
126 static lucene::search::Query* createQuery( const Strigi::Query& query )
128 return query.subQueries().size()
129 ? createBooleanQuery(query)
130 : createSimpleQuery(query);
133 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query )
135 switch (query.fields().size()) {
136 case 0: return createSingleFieldQuery("text", query);
137 case 1: return createSingleFieldQuery(query.fields()[0], query);
138 default: return createMultiFieldQuery(query);
142 static lucene::search::Query* createSingleFieldQuery( const string& field,
143 const Strigi::Query& query ) {
144 qDebug() << "Creating single field query: " << field.c_str();
145 TString fieldname = Strigi::Soprano::Util::convertSearchField( field );
146 lucene::search::Query* q;
147 lucene::index::Term* t;
148 const string& val = query.term().string();
149 switch (query.type()) {
150 case Strigi::Query::LessThan:
151 t = createTerm(fieldname, val.c_str());
152 q = _CLNEW lucene::search::RangeQuery(0, t, false);
153 break;
154 case Strigi::Query::LessThanEquals:
155 t = createTerm(fieldname, query.term().string());
156 q = _CLNEW lucene::search::RangeQuery(0, t, true);
157 break;
158 case Strigi::Query::GreaterThan:
159 t = createTerm(fieldname, query.term().string());
160 q = _CLNEW lucene::search::RangeQuery(t, 0, false);
161 break;
162 case Strigi::Query::GreaterThanEquals:
163 t = createTerm(fieldname, query.term().string());
164 q = _CLNEW lucene::search::RangeQuery(t, 0, true);
165 break;
166 case Strigi::Query::Keyword:
167 t = createKeywordTerm(fieldname, query.term().string());
168 q = _CLNEW lucene::search::TermQuery(t);
169 break;
170 default:
171 if (strpbrk(val.c_str(), "*?")) {
172 t = createWildCardTerm(fieldname, val);
173 q = _CLNEW lucene::search::WildcardQuery(t);
174 } else {
175 t = createTerm(fieldname, val);
176 q = _CLNEW lucene::search::TermQuery(t);
179 _CLDECDELETE(t);
180 return q;
183 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query )
185 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
186 for (vector<string>::const_iterator i = query.fields().begin();
187 i != query.fields().end(); ++i) {
188 lucene::search::Query* q = createSingleFieldQuery(*i, query);
189 bq->add(q, true, false, false);
191 return bq;
195 static QString escapeLiteralForSparqlQuery( const QString& s )
197 return QString( s ).replace( '\\', "\\\\" ).replace( '\"', "\\\"" );
201 class Strigi::Soprano::IndexReader::Private
203 public:
204 bool createDocument( const Node& res, IndexedDocument& doc ) {
205 StatementIterator it = repository->listStatements( Statement( res, Node(), Node() ) );
206 if ( it.lastError() ) {
207 return false;
210 // use the resource URI as fallback file URI
211 doc.uri = res.uri().toLocalFile().toUtf8().data();
213 while ( it.next() ) {
214 Statement s = *it;
215 if ( s.object().isLiteral() ) {
216 std::string fieldName = Util::fieldName( s.predicate().uri() );
217 std::string value = s.object().toString().toUtf8().data();
219 if (fieldName == "text") {
220 doc.fragment = value;
222 else if (fieldName == FieldRegister::pathFieldName) {
223 qDebug() << "Setting IndexedDocument uri=" << value.c_str();
224 doc.uri = value;
226 else if (fieldName == FieldRegister::mimetypeFieldName) {
227 doc.mimetype = value;
229 else if (fieldName == FieldRegister::mtimeFieldName) {
230 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
231 if ( s.object().literal().isDateTime() ) {
232 doc.mtime = s.object().literal().toDateTime().toTime_t();
234 else {
235 doc.mtime = s.object().literal().toUnsignedInt();
238 else if (fieldName == FieldRegister::sizeFieldName) {
239 doc.size = s.object().literal().toInt64();
241 else {
242 doc.properties.insert( make_pair<const string, string>( fieldName, value ) );
245 else {
246 // FIXME: For "Strigi++" we should at least go one level deeper, i.e. make an RDF query on those results that are
247 // not literal statements
251 return true;
254 // ::Soprano::Index::IndexFilterModel* repository;
255 ::Soprano::Model* repository;
259 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model* model )
260 : Strigi::IndexReader()
262 qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
263 d = new Private;
264 d->repository = model;
268 Strigi::Soprano::IndexReader::~IndexReader()
270 qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
271 delete d;
275 int32_t Strigi::Soprano::IndexReader::countHits( const Query& query )
277 qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
279 lucene::search::Query* q = createQuery( query );
280 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( q->toString(), true ),
281 ::Soprano::Query::QueryLanguageUser,
282 QLatin1String( "lucene" ) );
283 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( q );
284 int s = 0;
285 while ( hits.next() ) {
286 qDebug() << "Query hit:" << hits.binding( 0 );
287 ++s;
289 _CLDELETE(q);
290 return s;
294 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query& query,
295 const std::vector<std::string>& fields,
296 const std::vector<Strigi::Variant::Type>& types,
297 std::vector<std::vector<Strigi::Variant> >& result,
298 int off, int max )
300 qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
301 lucene::search::Query* bq = createQuery( query );
302 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
303 ::Soprano::Query::QueryLanguageUser,
304 QLatin1String( "lucene" ) );
305 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
307 int i = -1;
308 while ( hits.next() ) {
309 ++i;
310 if ( i < off ) {
311 continue;
313 if ( i > max ) {
314 break;
317 // ::Soprano::Index::QueryHit hit = *hits;
318 std::vector<Strigi::Variant> resultRow;
319 std::vector<std::string>::const_iterator fieldIt = fields.begin();
320 std::vector<Strigi::Variant::Type>::const_iterator typesIt = types.begin();
321 while ( fieldIt != fields.end() ) {
322 if ( typesIt == types.end() ) {
323 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
324 return;
327 StatementIterator it = d->repository->listStatements( Statement( hits.binding( "resource" ),
328 Util::fieldUri( *fieldIt ),
329 Node() ) );
330 // FIXME: what if we have a field with a cardinality > 1?
331 if ( it.next() ) {
332 resultRow.push_back( Util::nodeToVariant( it.current().object() ) );
334 else {
335 resultRow.push_back( Strigi::Variant() );
338 ++fieldIt;
339 ++typesIt;
342 result.push_back( resultRow );
344 _CLDELETE(bq);
348 std::vector<Strigi::IndexedDocument> Strigi::Soprano::IndexReader::query( const Query& query, int off, int max )
350 qDebug() << "IndexReader::query in thread" << QThread::currentThread();
351 vector<IndexedDocument> results;
352 lucene::search::Query* bq = createQuery( query );
353 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
354 ::Soprano::Query::QueryLanguageUser,
355 QLatin1String( "lucene" ) );
356 // Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
358 int i = -1;
359 while ( hits.next() ) {
360 ++i;
361 if ( i < off ) {
362 continue;
364 if ( i > max ) {
365 break;
368 IndexedDocument result;
369 // ::Soprano::Index::QueryHit hit = *hits;
370 result.score = hits.binding( 1 ).literal().toDouble();
371 if ( d->createDocument( hits.binding( 0 ), result ) ) {
372 results.push_back( result );
374 else {
375 qDebug() << "Failed to create indexed document for resource " << hits.binding( 0 ) << ": " << d->repository->lastError();
378 _CLDELETE(bq);
379 return results;
383 // an empty parent url is perfectly valid as strigi stores a parent url for everything
384 void Strigi::Soprano::IndexReader::getChildren( const std::string& parent,
385 std::map<std::string, time_t>& children )
387 // qDebug() << "IndexReader::getChildren in thread" << QThread::currentThread();
388 QString query = QString( "select distinct ?path ?mtime where { "
389 "{ { ?r <%1> \"%2\"^^<%3> . } UNION { ?r <%1> %6 . } } . "
390 "?r <%4> ?mtime . "
391 "?r <%5> ?path . "
392 "}")
393 .arg( Util::fieldUri( FieldRegister::parentLocationFieldName ).toString(),
394 escapeLiteralForSparqlQuery( QString::fromUtf8( parent.c_str() ) ),
395 Vocabulary::XMLSchema::string().toString(),
396 Util::fieldUri( FieldRegister::mtimeFieldName ).toString(),
397 Util::fieldUri( FieldRegister::pathFieldName ).toString(),
398 Node( QUrl::fromLocalFile( QFile::decodeName( parent.c_str() ) ) ).toN3() );
400 // qDebug() << "running getChildren query:" << query;
402 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
404 while ( result.next() ) {
405 Node pathNode = result.binding( "path" );
406 Node mTimeNode = result.binding( "mtime" );
407 // qDebug() << "file in index: " << pathNode.toString() << "mtime:" << mTimeNode.literal().toDateTime() << "(" << mTimeNode.literal().toDateTime().toTime_t() << ")";
409 // be backwards compatible in case there are paths left encoded as literals
410 std::string path;
411 if ( pathNode.isLiteral() )
412 path = pathNode.toString().toUtf8().data();
413 else
414 path = QFile::encodeName( pathNode.uri().toLocalFile() ).data();
416 // Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
417 if ( mTimeNode.literal().isDateTime() ) {
418 children[path] = mTimeNode.literal().toDateTime().toTime_t();
420 else {
421 children[path] = mTimeNode.literal().toUnsignedInt();
427 int32_t Strigi::Soprano::IndexReader::countDocuments()
429 qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
430 // FIXME: the only solution I see ATM is: select distinct ?r where { ?r ?p ?o }
431 return 0;
435 int32_t Strigi::Soprano::IndexReader::countWords()
437 qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
438 // FIXME: what to do here? use the index? Count the predicates?
439 return -1;
443 int64_t Strigi::Soprano::IndexReader::indexSize()
445 qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
446 return d->repository->statementCount();
450 time_t Strigi::Soprano::IndexReader::mTime( const std::string& uri )
452 // qDebug() << "IndexReader::mTime in thread" << QThread::currentThread();
453 QString query = QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
454 .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString(),
455 Util::fieldUri( FieldRegister::pathFieldName ).toString(),
456 escapeLiteralForSparqlQuery( QString::fromUtf8( uri.c_str() ) ),
457 Vocabulary::XMLSchema::string().toString() );
459 qDebug() << "mTime( " << uri.c_str() << ") query:" << query;
461 QueryResultIterator it = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
463 time_t mtime = 0;
464 if ( it.next() ) {
465 ::Soprano::LiteralValue val = it.binding( "mtime" ).literal();
467 // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
468 if ( val.isDateTime() ) {
469 mtime = val.toDateTime().toTime_t();
471 else {
472 mtime = val.toUnsignedInt();
475 return mtime;
479 std::vector<std::string> Strigi::Soprano::IndexReader::fieldNames()
481 qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
482 // This is a weird method
483 // Our list of field names (the predicates) is probably awefully long.
485 std::vector<std::string> fields;
486 QueryResultIterator it = d->repository->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QueryLanguageSparql );
487 while ( it.next() ) {
488 fields.push_back( Util::fieldName( it.binding("p").uri() ) );
490 return fields;
494 std::vector<std::pair<std::string,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string& query,
495 const std::string& fieldname,
496 const std::string& labeltype )
498 Q_UNUSED(query);
499 Q_UNUSED(fieldname);
500 Q_UNUSED(labeltype);
502 // FIXME: what is meant by fieldname and labeltype?
503 qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
504 // IMPLEMENTME? Seems not like a very important method though.
505 return std::vector<std::pair<std::string,uint32_t> >();
509 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string& keywordprefix,
510 const std::vector<std::string>& fieldnames)
512 Q_UNUSED(keywordprefix);
513 Q_UNUSED(fieldnames);
515 qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
516 // the clucene indexer also returns 2. I suspect this means: "not implemented" ;)
517 return 2;
521 std::vector<std::string> Strigi::Soprano::IndexReader::keywords( const std::string& keywordmatch,
522 const std::vector<std::string>& fieldnames,
523 uint32_t max, uint32_t offset )
525 Q_UNUSED(keywordmatch);
526 Q_UNUSED(fieldnames);
527 Q_UNUSED(max);
528 Q_UNUSED(offset);
530 qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
531 // IMPLEMENTME? Seems like a rarely used method...
532 return std::vector<std::string>();