add more spacing
[personal-kdebase.git] / runtime / nepomuk / strigibackend / sopranoindexwriter.cpp
blob3ff9a6c45cfeab0827f04a341a69b00d0b821bde
1 /*
2 Copyright (C) 2007-2008 Sebastian Trueg <trueg@kde.org>
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of
7 the License, or (at your option) any later version.
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this library; see the file COPYING. If not, write to
16 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 Boston, MA 02110-1301, USA.
20 #include "sopranoindexwriter.h"
21 #include "util.h"
23 #include <Soprano/Soprano>
24 #include <Soprano/Vocabulary/RDF>
25 #include <Soprano/Vocabulary/Xesam>
26 #include <Soprano/LiteralValue>
28 #include <QtCore/QList>
29 #include <QtCore/QHash>
30 #include <QtCore/QVariant>
31 #include <QtCore/QFileInfo>
32 #include <QtCore/QFile>
33 #include <QtCore/QUrl>
34 #include <QtCore/QDebug>
35 #include <QtCore/QThread>
36 #include <QtCore/QDateTime>
37 #include <QtCore/QByteArray>
38 #include <QtCore/QUuid>
40 #include <KUrl>
42 #include <sys/stat.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <errno.h>
47 #include <map>
48 #include <sstream>
49 #include <algorithm>
52 // IMPORTANT: strings in Strigi are apparently UTF8! Except for file names. Those are in local encoding.
54 using namespace Soprano;
57 uint qHash( const std::string& s )
59 return qHash( s.c_str() );
62 namespace {
63 QString findArchivePath( const QString& path ) {
64 QString p( path );
65 int i = 0;
66 while ( ( i = p.lastIndexOf( '/' ) ) > 0 ) {
67 p.truncate( i );
68 if ( QFileInfo( p ).isFile() ) {
69 return p;
72 return QString();
75 QUrl createResourceUri( const Strigi::AnalysisResult* idx ) {
76 // HACK: Strigi includes analysers that recurse into tar or zip archives and index
77 // the files therein. In KDE these files could perfectly be handled through kio slaves
78 // such as tar:/ or zip:/
79 // Here we try to use KDE-compatible URIs for these indexed files the best we can
80 // everything else defaults to file:/
81 QUrl uri;
82 QString path = QFile::decodeName( idx->path().c_str() );
83 if ( KUrl::isRelativeUrl( path ) )
84 uri = QUrl::fromLocalFile( QFileInfo( path ).absoluteFilePath() );
85 else
86 uri = KUrl( path ); // try to support http and other URLs
88 if ( idx->depth() > 0 ) {
89 QString archivePath = findArchivePath( path );
90 if ( QFile::exists( archivePath ) ) {
91 if ( archivePath.endsWith( QLatin1String( ".tar" ) ) ||
92 archivePath.endsWith( QLatin1String( ".tar.gz" ) ) ||
93 archivePath.endsWith( QLatin1String( ".tar.bz2" ) ) ) {
94 uri.setScheme( "tar" );
96 else if ( archivePath.endsWith( QLatin1String( ".zip" ) ) ) {
97 uri.setScheme( "zip" );
102 // fallback for all
103 if ( uri.scheme().isEmpty() ) {
104 uri.setScheme( "file" );
107 return uri;
110 QUrl createGraphUri() {
111 return QUrl( "urn:nepomuk:local:" + QUuid::createUuid().toString().remove( QRegExp( "[\\{\\}]" ) ) );
114 class FileMetaData
116 public:
117 // caching URIs for little speed improvement
118 QUrl fileUri;
119 QUrl context;
120 std::string content;
123 class RegisteredFieldData
125 public:
126 RegisteredFieldData( const QUrl& prop, QVariant::Type t )
127 : property( prop ),
128 dataType( t ),
129 isRdfType( prop == Vocabulary::RDF::type() ) {
132 QUrl property;
133 QVariant::Type dataType;
134 bool isRdfType;
139 class Strigi::Soprano::IndexWriter::Private
141 public:
142 Private()
143 : indexTransactionID( 0 ) {
144 literalTypes[FieldRegister::stringType] = QVariant::String;
145 literalTypes[FieldRegister::floatType] = QVariant::Double;
146 literalTypes[FieldRegister::integerType] = QVariant::Int;
147 literalTypes[FieldRegister::binaryType] = QVariant::ByteArray;
148 literalTypes[FieldRegister::datetimeType] = QVariant::DateTime; // Strigi encodes datetime as unsigned integer, i.e. addValue( ..., uint )
151 QVariant::Type literalType( const Strigi::FieldProperties& strigiType ) {
152 // it looks as if the typeUri can contain arbitrary values, URIs or stuff like "string"
153 QHash<std::string, QVariant::Type>::const_iterator it = literalTypes.constFind( strigiType.typeUri() );
154 if ( it == literalTypes.constEnd() ) {
155 return LiteralValue::typeFromDataTypeUri( QUrl::fromEncoded( strigiType.typeUri().c_str() ) );
157 else {
158 return *it;
162 LiteralValue createLiteralValue( QVariant::Type type,
163 const unsigned char* data,
164 uint32_t size ) {
165 QString value = QString::fromUtf8( ( const char* )data, size );
166 if ( type == QVariant::DateTime ) { // dataTime is stored as integer in strigi!
167 return LiteralValue( QDateTime::fromTime_t( value.toUInt() ) );
169 else if ( type != QVariant::Invalid ) {
170 return LiteralValue::fromString( value, type );
172 else {
173 // we default to string
174 return LiteralValue( value );
178 ::Soprano::Model* repository;
179 int indexTransactionID;
181 private:
182 QHash<std::string, QVariant::Type> literalTypes;
186 Strigi::Soprano::IndexWriter::IndexWriter( ::Soprano::Model* model )
187 : Strigi::IndexWriter()
189 // qDebug() << "IndexWriter::IndexWriter in thread" << QThread::currentThread();
190 d = new Private;
191 d->repository = model;
192 Util::storeStrigiMiniOntology( d->repository );
193 // qDebug() << "IndexWriter::IndexWriter done in thread" << QThread::currentThread();
197 Strigi::Soprano::IndexWriter::~IndexWriter()
199 delete d;
203 void Strigi::Soprano::IndexWriter::commit()
208 // delete all indexed data for the files listed in entries
209 void Strigi::Soprano::IndexWriter::deleteEntries( const std::vector<std::string>& entries )
211 // qDebug() << "IndexWriter::deleteEntries in thread" << QThread::currentThread();
213 QString systemLocationUri = Util::fieldUri( FieldRegister::pathFieldName ).toString();
214 for ( unsigned int i = 0; i < entries.size(); ++i ) {
215 QString path = QString::fromUtf8( entries[i].c_str() );
216 QString query = QString( "select ?g ?mg where { "
217 "{ { ?r <%1> \"%2\"^^<%3> . } UNION { ?r <%1> %6 . } } . "
218 "?g <%4> ?r . "
219 "OPTIONAL { ?mg <%5> ?g . } }" )
220 .arg( systemLocationUri )
221 .arg( path )
222 .arg( Vocabulary::XMLSchema::string().toString() )
223 .arg( Strigi::Ontology::indexGraphFor().toString() )
224 .arg( Vocabulary::NRL::coreGraphMetadataFor().toString() )
225 .arg( Node( QUrl::fromLocalFile( path ) ).toN3() );
227 qDebug() << "deleteEntries query:" << query;
229 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
230 if ( result.next() ) {
231 Node indexGraph = result.binding( "g" );
232 Node metaDataGraph = result.binding( "mg" );
234 result.close();
236 // delete the indexed data
237 d->repository->removeContext( indexGraph );
239 // delete the metadata (backwards compatible)
240 if ( metaDataGraph.isValid() )
241 d->repository->removeContext( metaDataGraph );
242 else
243 d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
249 void Strigi::Soprano::IndexWriter::deleteAllEntries()
251 // qDebug() << "IndexWriter::deleteAllEntries in thread" << QThread::currentThread();
253 // query all index graphs (FIXME: would a type derived from nrl:Graph be better than only the predicate?)
254 QString query = QString( "select ?g where { ?g <%1> ?r . }" ).arg( Strigi::Ontology::indexGraphFor().toString() );
256 qDebug() << "deleteAllEntries query:" << query;
258 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
259 QList<Node> allIndexGraphs = result.iterateBindings( "g" ).allNodes();
260 for ( QList<Node>::const_iterator it = allIndexGraphs.constBegin(); it != allIndexGraphs.constEnd(); ++it ) {
261 Node indexGraph = *it;
263 qDebug() << "Found indexGraph to delete:" << indexGraph;
265 // delete the indexed data
266 d->repository->removeContext( indexGraph );
268 // delete the metadata
269 d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
274 // called for each indexed file
275 void Strigi::Soprano::IndexWriter::startAnalysis( const AnalysisResult* idx )
277 if ( idx->depth() > 0 ) {
278 return;
281 // qDebug() << "IndexWriter::startAnalysis in thread" << QThread::currentThread();
282 FileMetaData* data = new FileMetaData();
283 data->fileUri = createResourceUri( idx );
285 // let's check if we already have data on the file
286 StatementIterator it = d->repository->listStatements( Node(),
287 Strigi::Ontology::indexGraphFor(),
288 data->fileUri );
289 if ( it.next() ) {
290 data->context = it.current().subject().uri();
292 else {
293 data->context = createGraphUri();
296 // qDebug() << "Starting analysis for" << data->fileUri << "in thread" << QThread::currentThread();
298 idx->setWriterData( data );
302 void Strigi::Soprano::IndexWriter::addText( const AnalysisResult* idx, const char* text, int32_t length )
304 if ( idx->depth() > 0 ) {
305 return;
308 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
309 md->content.append( text, length );
313 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
314 const RegisteredField* field,
315 const std::string& value )
317 if ( idx->depth() > 0 ) {
318 return;
321 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
322 if ( value.length() > 0 ) {
323 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
324 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
326 // Strigi uses rdf:type improperly since it stores the value as a string. We have to
327 // make sure it is a resource.
328 if ( rfd->isRdfType ) {
329 d->repository->addStatement( md->fileUri,
330 ::Soprano::Vocabulary::RDF::type(),
331 QUrl::fromEncoded( value.c_str(), QUrl::StrictMode ),
332 md->context );
334 else {
335 // we bend the plain strigi properties into something nicer, also because we do not want paths to be indexed, way too many false positives
336 // in standard desktop searches
337 if ( field->key() == FieldRegister::pathFieldName ||
338 field->key() == FieldRegister::parentLocationFieldName ) {
339 d->repository->addStatement( md->fileUri,
340 rfd->property,
341 QUrl::fromLocalFile( QFile::decodeName( QByteArray::fromRawData( value.c_str(), value.length() ) ) ),
342 md->context );
344 else {
345 d->repository->addStatement( Statement( md->fileUri,
346 rfd->property,
347 d->createLiteralValue( rfd->dataType, ( unsigned char* )value.c_str(), value.length() ),
348 md->context) );
351 if ( d->repository->lastError() )
352 qDebug() << "Failed to add value" << value.c_str();
354 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
358 // the main addValue method
359 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
360 const RegisteredField* field,
361 const unsigned char* data,
362 uint32_t size )
364 addValue( idx, field, std::string( ( const char* )data, size ) );
368 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult*, const RegisteredField*,
369 const std::string&, const std::string& )
371 // we do not support map types
375 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
376 const RegisteredField* field,
377 uint32_t value )
379 if ( idx->depth() > 0 ) {
380 return;
383 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
384 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
385 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
387 LiteralValue val( value );
388 if ( field->type() == FieldRegister::datetimeType ) {
389 val = QDateTime::fromTime_t( value );
392 d->repository->addStatement( Statement( md->fileUri,
393 rfd->property,
394 val,
395 md->context) );
396 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
400 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
401 const RegisteredField* field,
402 int32_t value )
404 if ( idx->depth() > 0 ) {
405 return;
408 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
409 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
410 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
412 d->repository->addStatement( Statement( md->fileUri,
413 rfd->property,
414 LiteralValue( value ),
415 md->context) );
416 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
420 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
421 const RegisteredField* field,
422 double value )
424 if ( idx->depth() > 0 ) {
425 return;
428 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
429 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
430 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
432 d->repository->addStatement( Statement( md->fileUri,
433 rfd->property,
434 LiteralValue( value ),
435 md->context) );
436 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
440 void Strigi::Soprano::IndexWriter::addTriplet( const std::string& subject,
441 const std::string& predicate, const std::string& object )
443 // PROBLEM: which named graph (context) should we use here? Create a new one for each triple? Use one until the
444 // next commit()?
446 // FIXME: create an NRL metadata graph
447 d->repository->addStatement( Statement( Node( QUrl( QString::fromUtf8( subject.c_str() ) ) ),
448 Node( QUrl( QString::fromUtf8( predicate.c_str() ) ) ),
449 Node( QUrl( QString::fromUtf8( object.c_str() ) ) ),
450 Node() ) );
454 // called after each indexed file
455 void Strigi::Soprano::IndexWriter::finishAnalysis( const AnalysisResult* idx )
457 if ( idx->depth() > 0 ) {
458 return;
461 // qDebug() << "IndexWriter::finishAnalysis in thread" << QThread::currentThread();
462 FileMetaData* md = static_cast<FileMetaData*>( idx->writerData() );
464 if ( md->content.length() > 0 ) {
465 d->repository->addStatement( Statement( md->fileUri,
466 Vocabulary::Xesam::asText(),
467 LiteralValue( QString::fromUtf8( md->content.c_str() ) ),
468 md->context ) );
469 if ( d->repository->lastError() )
470 qDebug() << "Failed to add" << md->fileUri << "as text" << QString::fromUtf8( md->content.c_str() );
473 // Strigi only indexes files and extractors mostly (if at all) store the xesam:DataObject type (i.e. the contents)
474 // Thus, here we go the easy way and mark each indexed file as a xesam:File.
475 if ( QFileInfo( QFile::decodeName( idx->path().c_str() ) ).isDir() )
476 d->repository->addStatement( Statement( md->fileUri,
477 Vocabulary::RDF::type(),
478 Vocabulary::Xesam::Folder(),
479 md->context ) );
480 else
481 d->repository->addStatement( Statement( md->fileUri,
482 Vocabulary::RDF::type(),
483 Vocabulary::Xesam::File(),
484 md->context ) );
487 // create the provedance data for the data graph
488 // TODO: add more data at some point when it becomes of interest
489 QUrl metaDataContext = md->context.toString() + "-metadata";
490 d->repository->addStatement( Statement( md->context,
491 Vocabulary::RDF::type(),
492 Vocabulary::NRL::InstanceBase(),
493 metaDataContext ) );
494 d->repository->addStatement( Statement( md->context,
495 Vocabulary::NAO::created(),
496 LiteralValue( QDateTime::currentDateTime() ),
497 metaDataContext ) );
498 d->repository->addStatement( Statement( md->context,
499 Strigi::Ontology::indexGraphFor(),
500 md->fileUri,
501 metaDataContext ) );
502 d->repository->addStatement( Statement( metaDataContext,
503 Vocabulary::RDF::type(),
504 Vocabulary::NRL::GraphMetadata(),
505 metaDataContext ) );
506 d->repository->addStatement( metaDataContext,
507 Vocabulary::NRL::coreGraphMetadataFor(),
508 md->context,
509 metaDataContext );
511 // cleanup
512 delete md;
513 idx->setWriterData( 0 );
515 // qDebug() << "IndexWriter::finishAnalysis done in thread" << QThread::currentThread();
519 void Strigi::Soprano::IndexWriter::initWriterData( const Strigi::FieldRegister& f )
521 map<string, RegisteredField*>::const_iterator i;
522 map<string, RegisteredField*>::const_iterator end = f.fields().end();
523 for (i = f.fields().begin(); i != end; ++i) {
524 QUrl prop = Util::fieldUri( i->second->key() );
525 i->second->setWriterData( new RegisteredFieldData( prop,
526 prop == Vocabulary::RDF::type()
527 ? QVariant::Invalid
528 : d->literalType( i->second->properties() ) ) );
533 void Strigi::Soprano::IndexWriter::releaseWriterData( const Strigi::FieldRegister& f )
535 map<string, RegisteredField*>::const_iterator i;
536 map<string, RegisteredField*>::const_iterator end = f.fields().end();
537 for (i = f.fields().begin(); i != end; ++i) {
538 delete static_cast<RegisteredFieldData*>( i->second->writerData() );
539 i->second->setWriterData( 0 );