2 Copyright (C) 2007-2008 Sebastian Trueg <trueg@kde.org>
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of
7 the License, or (at your option) any later version.
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this library; see the file COPYING. If not, write to
16 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 Boston, MA 02110-1301, USA.
20 #include "sopranoindexwriter.h"
23 #include <Soprano/Soprano>
24 #include <Soprano/Vocabulary/RDF>
25 #include <Soprano/Vocabulary/Xesam>
26 #include <Soprano/LiteralValue>
28 #include <QtCore/QList>
29 #include <QtCore/QHash>
30 #include <QtCore/QVariant>
31 #include <QtCore/QFileInfo>
32 #include <QtCore/QFile>
33 #include <QtCore/QUrl>
34 #include <QtCore/QDebug>
35 #include <QtCore/QThread>
36 #include <QtCore/QDateTime>
37 #include <QtCore/QByteArray>
38 #include <QtCore/QUuid>
52 // IMPORTANT: strings in Strigi are apparently UTF8! Except for file names. Those are in local encoding.
54 using namespace Soprano
;
57 uint
qHash( const std::string
& s
)
59 return qHash( s
.c_str() );
// Helper used by createResourceUri() to obtain the archive path for an entry
// nested inside another file.
//
// The visible loop repeatedly locates the last '/' in the working string `p`
// and checks whether the current candidate is a regular file
// (QFileInfo::isFile()).
//
// NOTE(review): the declarations of `p` and `i`, the step that shortens `p`
// and the return statements are not visible in this excerpt -- presumably the
// first candidate that is a file is returned; confirm against the full source.
63 QString
findArchivePath( const QString
& path
) {
66 while ( ( i
= p
.lastIndexOf( '/' ) ) > 0 ) {
68 if ( QFileInfo( p
).isFile() ) {
// Builds the URI used for the resource described by `idx`.
//
// Steps visible below:
//  - idx->path() is decoded from the local file name encoding
//    (QFile::decodeName).
//  - A relative path is turned into an absolute local-file URL; the KUrl
//    assignment handles paths that already are URLs (see the inline comment).
//  - For nested entries (idx->depth() > 0) whose archive path exists on disk,
//    the scheme is switched to "tar" (.tar, .tar.gz, .tar.bz2) or "zip" (.zip).
//  - A URI that still has no scheme gets the "file" scheme.
//
// NOTE(review): the declaration of `uri`, the `else` between the two
// assignments and the return statement are not visible in this excerpt.
75 QUrl
createResourceUri( const Strigi::AnalysisResult
* idx
) {
76 // HACK: Strigi includes analysers that recurse into tar or zip archives and index
77 // the files therein. In KDE these files could perfectly be handled through kio slaves
78 // such as tar:/ or zip:/
79 // Here we try to use KDE-compatible URIs for these indexed files the best we can
80 // everything else defaults to file:/
82 QString path
= QFile::decodeName( idx
->path().c_str() );
83 if ( KUrl::isRelativeUrl( path
) )
84 uri
= QUrl::fromLocalFile( QFileInfo( path
).absoluteFilePath() );
86 uri
= KUrl( path
); // try to support http and other URLs
88 if ( idx
->depth() > 0 ) {
89 QString archivePath
= findArchivePath( path
);
90 if ( QFile::exists( archivePath
) ) {
91 if ( archivePath
.endsWith( QLatin1String( ".tar" ) ) ||
92 archivePath
.endsWith( QLatin1String( ".tar.gz" ) ) ||
93 archivePath
.endsWith( QLatin1String( ".tar.bz2" ) ) ) {
94 uri
.setScheme( "tar" );
96 else if ( archivePath
.endsWith( QLatin1String( ".zip" ) ) ) {
97 uri
.setScheme( "zip" );
103 if ( uri
.scheme().isEmpty() ) {
104 uri
.setScheme( "file" );
110 QUrl
createGraphUri() {
111 return QUrl( "urn:nepomuk:local:" + QUuid::createUuid().toString().remove( QRegExp( "[\\{\\}]" ) ) );
117 // caching URIs for little speed improvement
// Per-field data that initWriterData() attaches to each Strigi RegisteredField
// via setWriterData() and that the addValue() overloads read back.
//
// Visible members:
//  - dataType:  the QVariant type used for literal values of the field
//  - isRdfType: true when the field's property equals Vocabulary::RDF::type()
//
// NOTE(review): the access specifiers, the remaining initializers and the other
// member declarations are not visible in this excerpt.
123 class RegisteredFieldData
126 RegisteredFieldData( const QUrl
& prop
, QVariant::Type t
)
129 isRdfType( prop
== Vocabulary::RDF::type() ) {
133 QVariant::Type dataType
;
// Private data of the index writer.
//
// Members visible below:
//  - repository:         the Soprano model all statements are written to
//  - indexTransactionID: initialised to 0 by the constructor
//  - literalTypes:       maps Strigi's built-in type names (FieldRegister::*Type)
//                        to the QVariant type used for literals
//
// Helpers:
//  - literalType() looks the field's typeUri() up in literalTypes and, when it
//    is not found, asks LiteralValue::typeFromDataTypeUri() instead.
//  - createLiteralValue() decodes the raw bytes as UTF-8 and converts them
//    according to `type`: DateTime values are parsed as an unsigned integer and
//    converted with QDateTime::fromTime_t(), other valid types go through
//    LiteralValue::fromString(), and an invalid type yields a string literal.
//
// NOTE(review): some lines of this class (access specifiers, the `size`
// parameter, the non-fallback return of literalType()) are not visible in this
// excerpt.
139 class Strigi::Soprano::IndexWriter::Private
143 : indexTransactionID( 0 ) {
144 literalTypes
[FieldRegister::stringType
] = QVariant::String
;
145 literalTypes
[FieldRegister::floatType
] = QVariant::Double
;
146 literalTypes
[FieldRegister::integerType
] = QVariant::Int
;
147 literalTypes
[FieldRegister::binaryType
] = QVariant::ByteArray
;
148 literalTypes
[FieldRegister::datetimeType
] = QVariant::DateTime
; // Strigi encodes datetime as unsigned integer, i.e. addValue( ..., uint )
151 QVariant::Type
literalType( const Strigi::FieldProperties
& strigiType
) {
152 // it looks as if the typeUri can contain arbitrary values, URIs or stuff like "string"
153 QHash
<std::string
, QVariant::Type
>::const_iterator it
= literalTypes
.constFind( strigiType
.typeUri() );
154 if ( it
== literalTypes
.constEnd() ) {
155 return LiteralValue::typeFromDataTypeUri( QUrl::fromEncoded( strigiType
.typeUri().c_str() ) );
162 LiteralValue
createLiteralValue( QVariant::Type type
,
163 const unsigned char* data
,
165 QString value
= QString::fromUtf8( ( const char* )data
, size
);
166 if ( type
== QVariant::DateTime
) { // datetime is stored as integer in strigi!
167 return LiteralValue( QDateTime::fromTime_t( value
.toUInt() ) );
169 else if ( type
!= QVariant::Invalid
) {
170 return LiteralValue::fromString( value
, type
);
173 // we default to string
174 return LiteralValue( value
);
178 ::Soprano::Model
* repository
;
179 int indexTransactionID
;
182 QHash
<std::string
, QVariant::Type
> literalTypes
;
// Constructs an index writer that stores its data in `model`.
//
// The model pointer is kept in d->repository and passed to
// Util::storeStrigiMiniOntology().
//
// NOTE(review): the creation of `d` is not visible in this excerpt.
186 Strigi::Soprano::IndexWriter::IndexWriter( ::Soprano::Model
* model
)
187 : Strigi::IndexWriter()
189 // qDebug() << "IndexWriter::IndexWriter in thread" << QThread::currentThread();
191 d
->repository
= model
;
192 Util::storeStrigiMiniOntology( d
->repository
);
193 // qDebug() << "IndexWriter::IndexWriter done in thread" << QThread::currentThread();
// Destructor.
// NOTE(review): the body is not visible in this excerpt -- presumably it
// releases the private data `d`; confirm against the full source.
197 Strigi::Soprano::IndexWriter::~IndexWriter()
// commit() member of the index writer.
// NOTE(review): the body is not visible in this excerpt, so what (if anything)
// is flushed here cannot be stated; confirm against the full source.
203 void Strigi::Soprano::IndexWriter::commit()
// Deletes all indexed data for the files listed in `entries`.
//
// For each entry a SPARQL query looks up the index graph (?g) of the resource
// whose path property matches the entry, plus an optional metadata graph (?mg).
// If the query yields a row, the index graph is removed, then the metadata
// graph when it is valid, and finally all statements that have the index graph
// as subject. Only the first result row is processed (`if`, not `while`).
//
// NOTE(review): the path is decoded with QString::fromUtf8() here, whereas the
// note at the top of the file says file names are in local encoding and other
// functions use QFile::decodeName() -- confirm which is intended.
// NOTE(review): the path is inserted into the query text via arg(); a path
// containing '"' or '\' may break the query -- verify escaping. Some arg()
// calls and part of the query string are not visible in this excerpt.
208 // delete all indexed data for the files listed in entries
209 void Strigi::Soprano::IndexWriter::deleteEntries( const std::vector
<std::string
>& entries
)
211 // qDebug() << "IndexWriter::deleteEntries in thread" << QThread::currentThread();
213 QString systemLocationUri
= Util::fieldUri( FieldRegister::pathFieldName
).toString();
214 for ( unsigned int i
= 0; i
< entries
.size(); ++i
) {
215 QString path
= QString::fromUtf8( entries
[i
].c_str() );
216 QString query
= QString( "select ?g ?mg where { "
217 "{ { ?r <%1> \"%2\"^^<%3> . } UNION { ?r <%1> %6 . } } . "
219 "OPTIONAL { ?mg <%5> ?g . } }" )
220 .arg( systemLocationUri
)
222 .arg( Vocabulary::XMLSchema::string().toString() )
223 .arg( Strigi::Ontology::indexGraphFor().toString() )
224 .arg( Vocabulary::NRL::coreGraphMetadataFor().toString() )
225 .arg( Node( QUrl::fromLocalFile( path
) ).toN3() );
227 qDebug() << "deleteEntries query:" << query
;
229 QueryResultIterator result
= d
->repository
->executeQuery( query
, ::Soprano::Query::QueryLanguageSparql
);
230 if ( result
.next() ) {
231 Node indexGraph
= result
.binding( "g" );
232 Node metaDataGraph
= result
.binding( "mg" );
236 // delete the indexed data
237 d
->repository
->removeContext( indexGraph
);
239 // delete the metadata (backwards compatible)
240 if ( metaDataGraph
.isValid() )
241 d
->repository
->removeContext( metaDataGraph
);
243 d
->repository
->removeAllStatements( Statement( indexGraph
, Node(), Node() ) );
249 void Strigi::Soprano::IndexWriter::deleteAllEntries()
251 // qDebug() << "IndexWriter::deleteAllEntries in thread" << QThread::currentThread();
253 // query all index graphs (FIXME: would a type derived from nrl:Graph be better than only the predicate?)
254 QString query
= QString( "select ?g where { ?g <%1> ?r . }" ).arg( Strigi::Ontology::indexGraphFor().toString() );
256 qDebug() << "deleteAllEntries query:" << query
;
258 QueryResultIterator result
= d
->repository
->executeQuery( query
, ::Soprano::Query::QUERY_LANGUAGE_SPARQL
);
259 QList
<Node
> allIndexGraphs
= result
.iterateBindings( "g" ).allNodes();
260 for ( QList
<Node
>::const_iterator it
= allIndexGraphs
.constBegin(); it
!= allIndexGraphs
.constEnd(); ++it
) {
261 Node indexGraph
= *it
;
263 qDebug() << "Found indexGraph to delete:" << indexGraph
;
265 // delete the indexed data
266 d
->repository
->removeContext( indexGraph
);
268 // delete the metadata
269 d
->repository
->removeAllStatements( Statement( indexGraph
, Node(), Node() ) );
// Called for each indexed file before any values are added.
//
// Allocates a FileMetaData object for the file, stores the resource URI
// computed by createResourceUri() in it and chooses the graph (context) the
// file's statements go into: the repository is queried for an existing
// indexGraphFor statement and its subject is used as context, otherwise
// a new graph URI is created with createGraphUri(). The FileMetaData is handed
// to Strigi via idx->setWriterData() so the other callbacks can retrieve it.
//
// NOTE(review): the body of the `depth() > 0` check, the remaining
// listStatements() arguments and the condition selecting between the two
// context assignments are not visible in this excerpt.
274 // called for each indexed file
275 void Strigi::Soprano::IndexWriter::startAnalysis( const AnalysisResult
* idx
)
277 if ( idx
->depth() > 0 ) {
281 // qDebug() << "IndexWriter::startAnalysis in thread" << QThread::currentThread();
282 FileMetaData
* data
= new FileMetaData();
283 data
->fileUri
= createResourceUri( idx
);
285 // let's check if we already have data on the file
286 StatementIterator it
= d
->repository
->listStatements( Node(),
287 Strigi::Ontology::indexGraphFor(),
290 data
->context
= it
.current().subject().uri();
293 data
->context
= createGraphUri();
296 // qDebug() << "Starting analysis for" << data->fileUri << "in thread" << QThread::currentThread();
298 idx
->setWriterData( data
);
// Appends `length` bytes of extracted text to the content buffer of the file
// currently being analysed (the FileMetaData stored as idx->writerData()).
// The accumulated content is written to the repository in finishAnalysis().
//
// NOTE(review): the body of the `depth() > 0` check is not visible in this
// excerpt -- presumably nested entries are skipped; confirm.
302 void Strigi::Soprano::IndexWriter::addText( const AnalysisResult
* idx
, const char* text
, int32_t length
)
304 if ( idx
->depth() > 0 ) {
308 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
309 md
->content
.append( text
, length
);
// Adds a string value for `field` to the resource of the file being analysed.
//
// Empty values are ignored. Otherwise one of three statements is added with
// md->fileUri as subject:
//  - rdf:type fields: the value is parsed as a URL (strict mode) so that the
//    object is a resource rather than a string;
//  - path / parent-location fields: the value is decoded from the local file
//    name encoding and stored as a local-file URL;
//  - everything else: a literal created by Private::createLiteralValue() using
//    the data type registered for the field.
// A failure reported by the repository is logged via qDebug().
//
// NOTE(review): the body of the `depth() > 0` check, the predicates and the
// trailing arguments of the addStatement() calls are not visible in this
// excerpt.
313 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
314 const RegisteredField
* field
,
315 const std::string
& value
)
317 if ( idx
->depth() > 0 ) {
321 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
322 if ( value
.length() > 0 ) {
323 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
324 RegisteredFieldData
* rfd
= reinterpret_cast<RegisteredFieldData
*>( field
->writerData() );
326 // Strigi uses rdf:type improperly since it stores the value as a string. We have to
327 // make sure it is a resource.
328 if ( rfd
->isRdfType
) {
329 d
->repository
->addStatement( md
->fileUri
,
330 ::Soprano::Vocabulary::RDF::type(),
331 QUrl::fromEncoded( value
.c_str(), QUrl::StrictMode
),
335 // we bend the plain strigi properties into something nicer, also because we do not want paths to be indexed, way too many false positives
336 // in standard desktop searches
337 if ( field
->key() == FieldRegister::pathFieldName
||
338 field
->key() == FieldRegister::parentLocationFieldName
) {
339 d
->repository
->addStatement( md
->fileUri
,
341 QUrl::fromLocalFile( QFile::decodeName( QByteArray::fromRawData( value
.c_str(), value
.length() ) ) ),
345 d
->repository
->addStatement( Statement( md
->fileUri
,
347 d
->createLiteralValue( rfd
->dataType
, ( unsigned char* )value
.c_str(), value
.length() ),
351 if ( d
->repository
->lastError() )
352 qDebug() << "Failed to add value" << value
.c_str();
354 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
// Raw-buffer overload: wraps `data`/`size` in a std::string and forwards to the
// std::string overload of addValue().
// NOTE(review): the declaration of the `size` parameter is not visible in this
// excerpt.
358 // the main addValue method
359 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
360 const RegisteredField
* field
,
361 const unsigned char* data
,
364 addValue( idx
, field
, std::string( ( const char* )data
, size
) );
// Name/value (map-typed) overload. All parameters are unnamed and, per the
// comment below, map types are not supported by this writer.
368 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
*, const RegisteredField
*,
369 const std::string
&, const std::string
& )
371 // we do not support map types
// Numeric overload that also handles datetime fields: the value is wrapped in a
// LiteralValue, and for fields of type FieldRegister::datetimeType it is
// converted with QDateTime::fromTime_t() first. The resulting literal is added
// as a statement with md->fileUri as subject.
//
// NOTE(review): the declaration of `value`, the body of the `depth() > 0`
// check and the remaining addStatement() arguments are not visible in this
// excerpt.
375 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
376 const RegisteredField
* field
,
379 if ( idx
->depth() > 0 ) {
383 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
384 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
385 RegisteredFieldData
* rfd
= reinterpret_cast<RegisteredFieldData
*>( field
->writerData() );
387 LiteralValue
val( value
);
388 if ( field
->type() == FieldRegister::datetimeType
) {
389 val
= QDateTime::fromTime_t( value
);
392 d
->repository
->addStatement( Statement( md
->fileUri
,
396 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
// Numeric overload: wraps `value` in a LiteralValue and adds it as a statement
// with md->fileUri as subject.
//
// NOTE(review): the declaration of `value` (and therefore its type), the body
// of the `depth() > 0` check and the remaining addStatement() arguments are not
// visible in this excerpt.
400 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
401 const RegisteredField
* field
,
404 if ( idx
->depth() > 0 ) {
408 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
409 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
410 RegisteredFieldData
* rfd
= reinterpret_cast<RegisteredFieldData
*>( field
->writerData() );
412 d
->repository
->addStatement( Statement( md
->fileUri
,
414 LiteralValue( value
),
416 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
// Numeric overload: wraps `value` in a LiteralValue and adds it as a statement
// with md->fileUri as subject. The visible body matches the preceding overload.
//
// NOTE(review): the declaration of `value` (and therefore its type), the body
// of the `depth() > 0` check and the remaining addStatement() arguments are not
// visible in this excerpt.
420 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult
* idx
,
421 const RegisteredField
* field
,
424 if ( idx
->depth() > 0 ) {
428 // qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
429 FileMetaData
* md
= reinterpret_cast<FileMetaData
*>( idx
->writerData() );
430 RegisteredFieldData
* rfd
= reinterpret_cast<RegisteredFieldData
*>( field
->writerData() );
432 d
->repository
->addStatement( Statement( md
->fileUri
,
434 LiteralValue( value
),
436 // qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
// Adds a raw triple to the repository. Subject, predicate and object are each
// decoded as UTF-8 and turned into resource nodes (QUrl); the object is
// therefore always stored as a resource, never as a literal.
//
// NOTE(review): the argument following the object node (presumably the
// context) is not visible in this excerpt.
440 void Strigi::Soprano::IndexWriter::addTriplet( const std::string
& subject
,
441 const std::string
& predicate
, const std::string
& object
)
443 // PROBLEM: which named graph (context) should we use here? Create a new one for each triple? Use one until the
446 // FIXME: create an NRL metadata graph
447 d
->repository
->addStatement( Statement( Node( QUrl( QString::fromUtf8( subject
.c_str() ) ) ),
448 Node( QUrl( QString::fromUtf8( predicate
.c_str() ) ) ),
449 Node( QUrl( QString::fromUtf8( object
.c_str() ) ) ),
// Called after each indexed file; writes the data collected during analysis.
//
// Steps visible below:
//  - if text was collected via addText(), it is stored as xesam:asText literal
//    (a repository error is logged via qDebug());
//  - the resource is typed xesam:Folder when idx->path() is a directory on
//    disk, otherwise xesam:File;
//  - the data graph (md->context) is described as nrl:InstanceBase with a
//    nao:created timestamp and an indexGraphFor statement;
//  - a second graph named "<context>-metadata" is typed nrl:GraphMetadata and
//    linked through nrl:coreGraphMetadataFor;
//  - finally the writer data pointer on idx is reset to 0.
//
// NOTE(review): QString::fromUtf8( md->content.c_str() ) is called without a
// length, so content collected by addText() that contains an embedded NUL
// would be cut off there -- confirm whether that can occur.
// NOTE(review): the body of the `depth() > 0` check, the trailing arguments of
// the addStatement() calls, the `else` between the Folder/File branches and
// the lines right before setWriterData( 0 ) are not visible in this excerpt.
454 // called after each indexed file
455 void Strigi::Soprano::IndexWriter::finishAnalysis( const AnalysisResult
* idx
)
457 if ( idx
->depth() > 0 ) {
461 // qDebug() << "IndexWriter::finishAnalysis in thread" << QThread::currentThread();
462 FileMetaData
* md
= static_cast<FileMetaData
*>( idx
->writerData() );
464 if ( md
->content
.length() > 0 ) {
465 d
->repository
->addStatement( Statement( md
->fileUri
,
466 Vocabulary::Xesam::asText(),
467 LiteralValue( QString::fromUtf8( md
->content
.c_str() ) ),
469 if ( d
->repository
->lastError() )
470 qDebug() << "Failed to add" << md
->fileUri
<< "as text" << QString::fromUtf8( md
->content
.c_str() );
473 // Strigi only indexes files and extractors mostly (if at all) store the xesam:DataObject type (i.e. the contents)
474 // Thus, here we go the easy way and mark each indexed file as a xesam:File.
475 if ( QFileInfo( QFile::decodeName( idx
->path().c_str() ) ).isDir() )
476 d
->repository
->addStatement( Statement( md
->fileUri
,
477 Vocabulary::RDF::type(),
478 Vocabulary::Xesam::Folder(),
481 d
->repository
->addStatement( Statement( md
->fileUri
,
482 Vocabulary::RDF::type(),
483 Vocabulary::Xesam::File(),
487 // create the provenance data for the data graph
488 // TODO: add more data at some point when it becomes of interest
489 QUrl metaDataContext
= md
->context
.toString() + "-metadata";
490 d
->repository
->addStatement( Statement( md
->context
,
491 Vocabulary::RDF::type(),
492 Vocabulary::NRL::InstanceBase(),
494 d
->repository
->addStatement( Statement( md
->context
,
495 Vocabulary::NAO::created(),
496 LiteralValue( QDateTime::currentDateTime() ),
498 d
->repository
->addStatement( Statement( md
->context
,
499 Strigi::Ontology::indexGraphFor(),
502 d
->repository
->addStatement( Statement( metaDataContext
,
503 Vocabulary::RDF::type(),
504 Vocabulary::NRL::GraphMetadata(),
506 d
->repository
->addStatement( metaDataContext
,
507 Vocabulary::NRL::coreGraphMetadataFor(),
513 idx
->setWriterData( 0 );
515 // qDebug() << "IndexWriter::finishAnalysis done in thread" << QThread::currentThread();
// Attaches a RegisteredFieldData object to every field in the register `f`.
//
// For each field the property URI is obtained from Util::fieldUri( key ) and a
// new RegisteredFieldData is stored with setWriterData(). The data type passed
// to it depends on whether the property is rdf:type; for other properties it
// comes from Private::literalType(). The objects are freed again in
// releaseWriterData().
//
// NOTE(review): the branch of the conditional used for rdf:type properties is
// not visible in this excerpt.
519 void Strigi::Soprano::IndexWriter::initWriterData( const Strigi::FieldRegister
& f
)
521 map
<string
, RegisteredField
*>::const_iterator i
;
522 map
<string
, RegisteredField
*>::const_iterator end
= f
.fields().end();
523 for (i
= f
.fields().begin(); i
!= end
; ++i
) {
524 QUrl prop
= Util::fieldUri( i
->second
->key() );
525 i
->second
->setWriterData( new RegisteredFieldData( prop
,
526 prop
== Vocabulary::RDF::type()
528 : d
->literalType( i
->second
->properties() ) ) );
// Counterpart of initWriterData(): for every field in the register `f` the
// attached RegisteredFieldData is deleted and the field's writer data pointer
// is reset to 0 so no dangling pointer is left behind.
533 void Strigi::Soprano::IndexWriter::releaseWriterData( const Strigi::FieldRegister
& f
)
535 map
<string
, RegisteredField
*>::const_iterator i
;
536 map
<string
, RegisteredField
*>::const_iterator end
= f
.fields().end();
537 for (i
= f
.fields().begin(); i
!= end
; ++i
) {
538 delete static_cast<RegisteredFieldData
*>( i
->second
->writerData() );
539 i
->second
->setWriterData( 0 );