add more spacing
[personal-kdebase.git] / runtime / nepomuk / services / queryservice / searchthread.cpp
blob05cf2b9bc66250069996335d76528c5668fddba1
1 /*
2 This file is part of the Nepomuk KDE project.
3 Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License version 2 as published by the Free Software Foundation.
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Library General Public License for more details.
14 You should have received a copy of the GNU Library General Public License
15 along with this library; see the file COPYING.LIB. If not, write to
16 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 Boston, MA 02110-1301, USA.
20 #include "searchthread.h"
21 #include "term.h"
22 #include "qurlhash.h"
24 #include <Nepomuk/ResourceManager>
25 #include <Nepomuk/Resource>
26 #include <Nepomuk/Types/Property>
27 #include <Nepomuk/Types/Class>
28 #include <Nepomuk/Types/Literal>
30 #include <Soprano/Version>
31 #include <Soprano/Model>
32 #include <Soprano/QueryResultIterator>
33 #include <Soprano/Node>
34 #include <Soprano/Statement>
35 #include <Soprano/LiteralValue>
36 #include <Soprano/StatementIterator>
37 #include <Soprano/Vocabulary/RDF>
38 #include <Soprano/Vocabulary/RDFS>
39 #include <Soprano/Vocabulary/NRL>
40 #include <Soprano/Vocabulary/NAO>
41 #include <Soprano/Vocabulary/XMLSchema>
42 #include <Soprano/Vocabulary/OWL>
44 #include <KDebug>
46 #include <QtCore/QTime>
50 // FIXME: With our cutoff score we might miss results that are hit multiple times and thus, would get their
51 // score increased
53 #warning Make query optimization methods return an invalid term if the query cannot be resolved and handle this as no results
55 using namespace Soprano;
57 namespace {
/**
 * Upper bound on the number of resources that resolveValues matches
 * when it converts an equals or contains term into resource terms.
 */
const int MAX_RESOURCES = 4;
65 void mergeInResult( QHash<QUrl, Nepomuk::Search::Result>& results, const Nepomuk::Search::Result& resource ) {
66 QHash<QUrl, Nepomuk::Search::Result>::iterator old = results.find( resource.resourceUri() );
67 if ( old == results.end() ) {
68 results.insert( resource.resourceUri(), resource );
70 else {
71 // FIXME: how do we join the scores properly? Is adding a good idea? It can certainly not be multiplication!
72 Nepomuk::Search::Result& result = *old;
73 result.setScore( result.score() + resource.score() );
77 void mergeInResults( QHash<QUrl, Nepomuk::Search::Result>& results, const QHash<QUrl, Nepomuk::Search::Result>& otherResults ) {
78 for ( QHash<QUrl, Nepomuk::Search::Result>::const_iterator it = otherResults.constBegin();
79 it != otherResults.constEnd(); ++it ) {
80 mergeInResult( results, it.value() );
84 // This is a copy of Soprano::Index::IndexFilterModel::encodeStringForLuceneQuery
85 // which we do not use to prevent linking to sopranoindex
86 QString luceneQueryEscape( const QString& s ) {
87 /* Chars to escape: + - && || ! ( ) { } [ ] ^ " ~ : \ */
89 static QRegExp rx( "([\\-" + QRegExp::escape( "+&|!(){}[]^\"~:\\" ) + "])" );
90 QString es( s );
91 es.replace( rx, "\\\\1" );
92 return es;
95 QString luceneQueryEscape( const QUrl& s ) {
96 return luceneQueryEscape( QString::fromAscii( s.toEncoded() ) );
99 QString createLuceneLiteralQuery( const QString& escaped ) {
100 if ( escaped.contains( QRegExp( "\\s" ) ) ) {
101 return "\"" + escaped + "\"";
103 else {
104 return escaped;
108 QString createLuceneQuery( const Nepomuk::Search::SearchNode& node ) {
109 if ( node.term.type() == Nepomuk::Search::Term::LiteralTerm ) {
110 return createLuceneLiteralQuery( luceneQueryEscape( node.term.value().toString() ) );
112 else if ( node.term.type() == Nepomuk::Search::Term::ComparisonTerm ) {
113 return luceneQueryEscape( node.term.property() ) + ':' + createLuceneLiteralQuery( luceneQueryEscape( node.term.subTerms().first().value().toString() ) );
115 else {
116 Q_ASSERT( node.term.type() == Nepomuk::Search::Term::AndTerm ||
117 node.term.type() == Nepomuk::Search::Term::OrTerm );
119 QStringList sq;
120 foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
121 sq += createLuceneQuery( n );
123 if ( node.term.type() == Nepomuk::Search::Term::AndTerm ) {
124 return " ( " + sq.join( " AND " ) + " ) ";
126 else {
127 return " ( " + sq.join( " OR " ) + " ) ";
132 QString comparatorString( Nepomuk::Search::Term::Comparator c ) {
133 switch( c ) {
134 case Nepomuk::Search::Term::Contains:
135 return ":";
136 case Nepomuk::Search::Term::Equal:
137 return "=";
138 case Nepomuk::Search::Term::Greater:
139 return ">";
140 case Nepomuk::Search::Term::Smaller:
141 return "<";
142 case Nepomuk::Search::Term::GreaterOrEqual:
143 return ">=";
144 case Nepomuk::Search::Term::SmallerOrEqual:
145 return "<=";
147 // make gcc happy
148 return QString();
152 bool isNumberLiteralValue( const Soprano::LiteralValue& value ) {
153 return value.isInt() || value.isInt64() || value.isUnsignedInt() || value.isUnsignedInt64() || value.isDouble();
157 QString createGraphPattern( const Nepomuk::Search::SearchNode& node, int& varCnt, const QString& varName = QString( "?r" ) )
159 switch( node.term.type() ) {
160 case Nepomuk::Search::Term::ComparisonTerm: {
162 Nepomuk::Search::Term subTerm( node.term.subTerms().first() );
165 // is the subterm (we only support one ATM) a final term (no further subterms)
166 // -> actually match the literal or resource
168 if ( subTerm.type() == Nepomuk::Search::Term::ResourceTerm ||
169 subTerm.type() == Nepomuk::Search::Term::LiteralTerm ) {
170 if( node.term.comparator() != Nepomuk::Search::Term::Equal ) {
171 // For numbers there is no need for quotes + this way we can handle all the xsd decimal types
172 // FIXME: it may be necessary to escape stuff
173 QString filter = QString( "?var%1 %2 " )
174 .arg( ++varCnt )
175 .arg( comparatorString( node.term.comparator() ) );
176 if ( isNumberLiteralValue( subTerm.value() ) ) {
177 filter += subTerm.value().toString();
179 else {
180 Nepomuk::Types::Property prop( node.term.property() );
181 filter += QString( "\"%1\"" ).arg( subTerm.value().toString() );
182 if ( prop.literalRangeType().dataTypeUri().isValid() )
183 filter += QString( "^^<%1>" ).arg( prop.literalRangeType().dataTypeUri().toString() );
186 return QString( "%1 <%2> ?var%3 . FILTER(%4) . " )
187 .arg( varName )
188 .arg( QString::fromAscii( node.term.property().toEncoded() ) )
189 .arg( varCnt )
190 .arg( filter );
192 else {
193 if ( subTerm.type() == Nepomuk::Search::Term::ResourceTerm ) {
194 return QString( "%1 <%2> <%3> . " )
195 .arg( varName )
196 .arg( QString::fromAscii( node.term.property().toEncoded() ) )
197 .arg( QString::fromAscii( subTerm.resource().toEncoded() ) );
199 else if ( Nepomuk::Types::Property( node.term.property() ).range().isValid() ) {
200 return QString( "%7 <%1> ?x . { ?x <%2> \"%3\"^^<%4> . } UNION { ?x <%5> \"%3\"^^<%4>. } UNION { ?x <%6> \"%3\"^^<%4> . }" )
201 .arg( QString::fromAscii( node.term.property().toEncoded() ) )
202 .arg( Soprano::Vocabulary::RDFS::label().toString() )
203 .arg( subTerm.value().toString() )
204 .arg( Soprano::Vocabulary::XMLSchema::string().toString() )
205 .arg( Soprano::Vocabulary::NAO::prefLabel().toString() )
206 .arg( Soprano::Vocabulary::NAO::identifier().toString() )
207 .arg( varName );
209 else {
210 return QString( "%1 <%2> \"%3\"^^<%4> . " )
211 .arg( varName )
212 .arg( QString::fromAscii( node.term.property().toEncoded() ) )
213 .arg( subTerm.value().toString() )
214 .arg( Nepomuk::Types::Property( node.term.property() ).literalRangeType().dataTypeUri().toString() );
220 // Is the subterm not final, i.e. has further subterms
221 // -> combine graph pattern with subterm graph pattern
223 else {
224 QString bridgeVarName = QString( "?var%1" ).arg( ++varCnt );
225 return QString( "%1 <%2> %3 . " )
226 .arg( varName )
227 .arg( QString::fromAscii( node.term.property().toEncoded() ) )
228 .arg( bridgeVarName )
229 + createGraphPattern( node.subNodes.first(), varCnt, bridgeVarName );
233 case Nepomuk::Search::Term::AndTerm: {
234 QString s( "{ " );
235 foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
236 s += createGraphPattern( n, varCnt );
238 s += "} ";
239 return s;
242 case Nepomuk::Search::Term::OrTerm: {
243 QStringList s;
244 foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
245 s += createGraphPattern( n, varCnt );
247 Q_ASSERT( !s.isEmpty() );
248 return "{ " + s.join( " } UNION { " ) + " } ";
251 default:
252 Q_ASSERT_X( 0, "createGraphPattern", "unsupported Term type" );
255 return QString();
260 Nepomuk::Search::SearchThread::SearchThread( QObject* parent )
261 : QThread( parent )
266 Nepomuk::Search::SearchThread::~SearchThread()
271 void Nepomuk::Search::SearchThread::query( const Query& term, double cutOffScore )
273 if( isRunning() ) {
274 cancel();
277 kDebug() << term << cutOffScore;
279 m_canceled = false;
280 m_searchTerm = term;
281 m_cutOffScore = cutOffScore;
282 m_numResults = 0;
284 start();
288 void Nepomuk::Search::SearchThread::cancel()
290 m_canceled = true;
291 wait();
295 void Nepomuk::Search::SearchThread::run()
297 QTime time;
298 time.start();
300 if ( m_searchTerm.type() == Query::PlainQuery ) {
301 kDebug() << "Plain Query: " << m_searchTerm;
302 Term t = resolveFields( m_searchTerm.term() );
303 kDebug() << "Fields resolved:" << t;
304 t = resolveValues( t );
305 kDebug() << "Values resolved:" << t;
306 t = optimize( t );
307 kDebug() << "Optimized query:" << t;
309 search( splitLuceneSparql( t ) /*optimize( resolveValues( resolveFields( m_searchTerm ) ) )*/, 1.0, true );
311 else {
312 // FIXME: once we have the Soprano query API it should be simple to add the requestProperties here
313 // for now we do it the hacky way
314 QString query = m_searchTerm.sparqlQuery();
315 int pos = query.indexOf( QLatin1String( "where" ) );
316 if ( pos > 0 ) {
317 query.insert( pos, buildRequestPropertyVariableList() + ' ' );
318 pos = query.lastIndexOf( '}' );
319 if ( pos > 0 ) {
320 query.insert( pos, ' ' + buildRequestPropertyPatterns() + ' ' );
324 sparqlQuery( query, 1.0, true );
327 kDebug() << time.elapsed();
331 Nepomuk::Search::Term Nepomuk::Search::SearchThread::resolveFields( const Term& term )
333 switch( term.type() ) {
334 case Term::AndTerm:
335 case Term::OrTerm: {
336 Term newTerm;
337 newTerm.setType( term.type() );
338 QList<Term> terms = term.subTerms();
339 foreach( const Term& t, terms ) {
340 if ( m_canceled ) break;
341 newTerm.addSubTerm( resolveFields( t ) );
343 return newTerm;
347 case Term::ComparisonTerm: {
348 Term newTerm( term );
349 Term subTerm = term.subTerms().first();
350 if ( subTerm.type() != Term::LiteralTerm &&
351 subTerm.type() != Term::ResourceTerm ) {
352 newTerm.setSubTerms( QList<Term>() << resolveFields( subTerm ) );
355 if ( !newTerm.property().isValid() ) {
356 // FIXME: use the score of the field search as boost factors
357 QList<QUrl> properties = matchFieldName( term.field() );
358 if ( properties.count() > 0 ) {
359 if ( properties.count() == 1 ) {
360 newTerm.setProperty( properties.first() );
361 return newTerm;
363 else {
364 Term orTerm;
365 orTerm.setType( Term::OrTerm );
366 foreach( const QUrl& property, properties ) {
367 Term t( newTerm );
368 t.setProperty( property );
369 orTerm.addSubTerm( t );
371 return orTerm;
374 else {
375 kDebug() << "Failed to resolve field" << term.field() << "to any property!";
376 return Term();
381 default:
382 return term;
387 // precondition: resolveFields needs to be run before this one as it only touches properties
388 Nepomuk::Search::Term Nepomuk::Search::SearchThread::resolveValues( const Term& term )
390 switch( term.type() ) {
391 case Term::AndTerm:
392 case Term::OrTerm: {
393 Term newTerm;
394 newTerm.setType( term.type() );
395 QList<Term> terms = term.subTerms();
396 foreach( const Term& t, terms ) {
397 if ( m_canceled ) break;
398 newTerm.addSubTerm( resolveValues( t ) );
400 return newTerm;
404 case Term::ComparisonTerm: {
405 // FIXME: we could also handle this via lucene for literals but what is better?
406 // with lucene we have the additional work of getting the requestProperties
408 // FIXME: handle subqueries
411 // ComparisonTerm Terms can contain subterms that again. We do not support
412 // arbitrary subterms but only comparator terms. Here we will only resolve the
413 // last one since all others will be handled in a single SPARQL query.
415 // Also, non-comtains comparators are handled in the SPARQL query as well.
417 // Thus, in the end we only resolve literal contains terms.
419 if ( term.comparator() == Term::Contains &&
420 term.subTerms().first().type() == Term::LiteralTerm ) {
422 Q_ASSERT ( term.property().isValid() );
424 // we only need to augment terms that have a property with
425 // a non-literal range. These will never hit in a lucene query
426 // anyway
427 Nepomuk::Types::Property prop( term.property() );
428 if ( prop.range().isValid() ) {
430 Term orTerm;
431 orTerm.setType( Term::OrTerm );
433 // FIXME: cache the results as it is very well possible that we search the same multiple times
434 // if resolveFields did create an OR term
436 // rdfs:label has a higher priority than any other property
437 // TODO: without being able to query the resource type simple searching for term.value() is waaaaay to slow
438 //QString query = QString( "%1:\"%2\"^4 \"%2\"" )
439 QString query = QString( "%1:\"%2\" OR %3:\"%2\" OR %4:\"%2\"" )
440 .arg( luceneQueryEscape( Soprano::Vocabulary::RDFS::label() ) )
441 .arg( term.subTerms().first().value().toString() )
442 .arg( luceneQueryEscape( Soprano::Vocabulary::NAO::prefLabel() ) )
443 .arg( luceneQueryEscape( Soprano::Vocabulary::NAO::identifier() ) );
444 Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( query,
445 Soprano::Query::QueryLanguageUser,
446 "lucene" );
448 while ( hits.next() ) {
449 if ( m_canceled ) break;
451 // FIXME: use the lucene score as boost factor
452 QUrl hit = hits.binding( 0 ).uri();
453 if ( prop.range().uri() == Soprano::Vocabulary::RDFS::Resource() ||
454 Nepomuk::Resource( hit ).hasType( prop.range().uri() ) ) {
455 orTerm.addSubTerm( Term( term.property(), hit ) );
456 if ( orTerm.subTerms().count() == MAX_RESOURCES ) {
457 break;
462 if ( orTerm.subTerms().count() == 1 ) {
463 return orTerm.subTerms().first();
465 else if ( orTerm.subTerms().count() ) {
466 return orTerm;
468 else {
469 kDebug() << "Failed to match value" << term.subTerms().first().value() << "to any possible resource.";
470 return term;
473 else {
474 // nothing to do here
475 return term;
479 // non-literal term or non-contains term -> handled in SPARQL query
480 else {
481 Term newTerm( term );
482 newTerm.setSubTerms( QList<Term>() << resolveValues( term.subTerms().first() ) );
483 return newTerm;
487 default:
488 return term;
493 Nepomuk::Search::Term Nepomuk::Search::SearchThread::optimize( const Term& term )
495 switch( term.type() ) {
496 case Term::AndTerm:
497 case Term::OrTerm: {
498 QList<Term> subTerms = term.subTerms();
499 QList<Term> newSubTerms;
500 QList<Term>::const_iterator end( subTerms.constEnd() );
501 for ( QList<Term>::const_iterator it = subTerms.constBegin();
502 it != end; ++it ) {
503 const Term& t = *it;
504 Term ot = optimize( t );
505 if ( ot.type() == term.type() ) {
506 newSubTerms += ot.subTerms();
508 else {
509 newSubTerms += ot;
512 Term newTerm;
513 newTerm.setType( term.type() );
514 newTerm.setSubTerms( newSubTerms );
515 return newTerm;
518 default:
519 return term;
524 Nepomuk::Search::SearchNode Nepomuk::Search::SearchThread::splitLuceneSparql( const Term& term )
526 // Goal: separate the terms into 2 groups: literal and resource which are
527 // merged with only one AND or OR action. Is that possible?
529 // For now we will do this (our query lang does not handle nested queries anyway)
530 // LiteralTerm -> one lucene, no sparql
531 // ComparisonTerm -> one lucene, no sparql (resource contains will be resolved to equality above)
532 // AndTerm -> divide all subterms and create two "small" AND terms
533 // OrTerm -> divide all subterms and create two "small" OR terms
535 switch( term.type() ) {
536 case Term::LiteralTerm:
537 return SearchNode( term, SearchNode::Lucene );
539 case Term::ComparisonTerm:
540 if ( term.comparator() == Term::Contains &&
541 term.subTerms().first().type() == Term::LiteralTerm ) {
542 // no need for subnides here - we only use the subterm's value
543 return SearchNode( term, SearchNode::Lucene );
545 else {
546 // all subnodes are resolved and can be handled in a SPARQL query
547 SearchNode node( term, SearchNode::Sparql );
548 node.subNodes += splitLuceneSparql( term.subTerms().first() );
549 return node;
552 case Term::AndTerm:
553 case Term::OrTerm: {
554 QList<Term> subTerms = term.subTerms();
555 QList<SearchNode> luceneNodes, sparqlNodes, unknownNodes;
557 QList<Term>::const_iterator end( subTerms.constEnd() );
558 for ( QList<Term>::const_iterator it = subTerms.constBegin();
559 it != end; ++it ) {
560 SearchNode node = splitLuceneSparql( *it );
561 if ( node.type == SearchNode::Lucene ) {
562 luceneNodes += node;
564 else if ( node.type == SearchNode::Sparql ) {
565 sparqlNodes += node;
567 else {
568 unknownNodes += node;
572 if ( luceneNodes.count() && !sparqlNodes.count() && !unknownNodes.count() ) {
573 return SearchNode( term, SearchNode::Lucene, luceneNodes );
575 else if ( !luceneNodes.count() && sparqlNodes.count() && !unknownNodes.count() ) {
576 return SearchNode( term, SearchNode::Sparql, sparqlNodes );
578 else if ( !luceneNodes.count() && !sparqlNodes.count() && unknownNodes.count() ) {
579 return SearchNode( term, SearchNode::Unknown, unknownNodes );
581 else {
582 Term newTerm;
583 newTerm.setType( term.type() );
584 SearchNode andNode( newTerm );
585 if ( luceneNodes.count() )
586 andNode.subNodes += SearchNode( term, SearchNode::Lucene, luceneNodes );
587 if ( sparqlNodes.count() )
588 andNode.subNodes += SearchNode( term, SearchNode::Sparql, sparqlNodes );
589 if ( unknownNodes.count() )
590 andNode.subNodes += SearchNode( term, SearchNode::Unknown, unknownNodes );
591 return andNode;
595 default:
596 // Q_ASSERT_X( 0, "splitLuceneSparql", "invalid term" );
597 return SearchNode( Term() );
602 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::search( const SearchNode& node, double baseScore, bool reportResults )
604 if ( node.type == SearchNode::Lucene ) {
605 return luceneQuery( createLuceneQuery( node ), baseScore, reportResults );
607 else if ( node.type == SearchNode::Sparql ) {
608 return sparqlQuery( createSparqlQuery( node ), baseScore, reportResults );
610 else if ( node.term.type() == Term::AndTerm ) {
611 return andSearch( node.subNodes, baseScore, reportResults );
613 else {
614 return orSearch( node.subNodes, baseScore, reportResults );
619 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::andSearch( const QList<SearchNode>& nodes, double baseScore, bool reportResults )
621 QHash<QUrl, Result> results;
622 bool first = true;
623 foreach( const SearchNode& node, nodes ) {
624 if ( m_canceled ) break;
625 // FIXME: the search will restrict the number of results to maxResults although
626 // after the merge we might have less
627 QHash<QUrl, Result> termResults = search( node, baseScore, false );
628 if ( first ) {
629 results = termResults;
630 first = false;
632 else {
633 // intersect the results
634 // FIXME: sort by score
635 QHash<QUrl, Result>::iterator it = results.begin();
636 while ( it != results.end() ) {
637 if ( m_canceled ) break;
638 QHash<QUrl, Result>::const_iterator termIt = termResults.constFind( it.key() );
639 if ( termIt != termResults.constEnd() ) {
640 // update score
641 it.value().setScore( it.value().score() + termIt.value().score() );
642 ++it;
644 else {
645 it = results.erase( it );
651 if ( reportResults ) {
652 for ( QHash<QUrl, Result>::const_iterator it = results.constBegin();
653 it != results.constEnd(); ++it ) {
654 if ( m_canceled ) break;
655 if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
656 return results;
658 else {
659 ++m_numResults;
660 emit newResult( it.value() );
665 return results;
669 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::orSearch( const QList<SearchNode>& nodes, double baseScore, bool reportResults )
671 QHash<QUrl, Result> results;
672 foreach( const SearchNode& node, nodes ) {
673 if ( m_canceled ) break;
674 // FIXME: sort by score, ie. use the maxResults results with the highest score
675 mergeInResults( results, search( node, baseScore, reportResults ) );
677 if ( reportResults ) {
678 for ( QHash<QUrl, Result>::const_iterator it = results.constBegin();
679 it != results.constEnd(); ++it ) {
680 if ( m_canceled ) break;
681 if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
682 return results;
684 else {
685 ++m_numResults;
686 emit newResult( it.value() );
690 return results;
694 QList<QUrl> Nepomuk::Search::SearchThread::matchFieldName( const QString& field )
696 kDebug() << field;
698 QList<QUrl> results;
700 // Step 1: see if we have a direct match to a predicate label
701 // there is no need in selecting unused properties
702 QString query = QString( "select distinct ?p where { "
703 "?p <%1> <%2> . "
704 "?p <%3> \"%4\"^^<%5> . "
705 "?x ?p ?y . }" )
706 .arg( Soprano::Vocabulary::RDF::type().toString() )
707 .arg( Soprano::Vocabulary::RDF::Property().toString() )
708 .arg( Soprano::Vocabulary::RDFS::label().toString() )
709 .arg( field )
710 .arg( Soprano::Vocabulary::XMLSchema::string().toString() );
711 kDebug() << "Direct match query:" << query;
713 Soprano::QueryResultIterator labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
714 Soprano::Query::QueryLanguageSparql );
715 if ( !m_canceled ) {
716 while ( labelHits.next() ) {
717 results << labelHits.binding( "p" ).uri();
718 kDebug() << "Found direct match" << labelHits.binding( "p" ).uri();
721 if ( results.isEmpty() ) {
722 // FIXME: how about we have two repositories: one for the ontologies and one for the data.
723 // I don't think there will be relations between the RDF or Xesam ontology and some
724 // metadata....
725 // Because then queries like the one we are doing here will be more performant since
726 // we do not search the data itself and do not have to filter
727 // BUT: What about inference?
729 query = QString( "select ?p where { "
730 "?p <%1> <%2> . "
731 "?p <%3> ?label . "
732 "FILTER(REGEX(STR(?label),'%4','i')) . }" )
733 .arg( Soprano::Vocabulary::RDF::type().toString() )
734 .arg( Soprano::Vocabulary::RDF::Property().toString() )
735 .arg( Soprano::Vocabulary::RDFS::label().toString() )
736 .arg( field );
737 kDebug() << "Indirect hit query:" << query;
738 labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
739 Soprano::Query::QueryLanguageSparql );
740 QString newQuery;
741 while ( labelHits.next() ) {
742 results << labelHits.binding( "p" ).uri();
743 kDebug() << "Found indirect match by label" << labelHits.binding( "p" ).uri();
748 if ( results.isEmpty() ) {
749 query = QString( "select ?p where { "
750 "?p <%1> <%2> . "
751 "FILTER(REGEX(STR(?p),'%3','i')) . }" )
752 .arg( Soprano::Vocabulary::RDF::type().toString() )
753 .arg( Soprano::Vocabulary::RDF::Property().toString() )
754 .arg( field );
755 kDebug() << "Indirect hit query:" << query;
756 labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
757 Soprano::Query::QueryLanguageSparql );
758 QString newQuery;
759 while ( labelHits.next() ) {
760 results << labelHits.binding( "p" ).uri();
761 kDebug() << "Found indirect match by name" << labelHits.binding( "p" ).uri();
766 return results;
770 QString Nepomuk::Search::SearchThread::createSparqlQuery( const Nepomuk::Search::SearchNode& node )
772 int varCnt = 0;
773 return QString( "select distinct ?r %1 where { graph ?g { ?r a ?type . } . ?g a <%2> . %3 %4 }" )
774 .arg( buildRequestPropertyVariableList() )
775 .arg( Soprano::Vocabulary::NRL::InstanceBase().toString() )
776 .arg( createGraphPattern( node, varCnt ) )
777 .arg( buildRequestPropertyPatterns() );
781 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::sparqlQuery( const QString& query, double baseScore, bool reportResults )
783 kDebug() << query;
785 QHash<QUrl, Result> results;
787 Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( query, Soprano::Query::QueryLanguageSparql );
788 while ( hits.next() ) {
789 if ( m_canceled ) break;
791 Result result = extractResult( hits );
792 result.setScore( baseScore );
794 kDebug() << "Found result:" << result.resourceUri();
796 // these are actual direct hits and we can report them right away
797 if ( reportResults ) {
798 if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
799 return results;
801 else {
802 ++m_numResults;
803 emit newResult( result );
807 results.insert( result.resourceUri(), result );
810 return results;
814 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::luceneQuery( const QString& query, double baseScore, bool reportResults )
816 QString finalQuery( query );
818 // if Soprano is 2.1.64 or newer the storage service does force the indexing or rdf:type which means that
819 // we can query it via lucene queries
820 // normally for completeness we would have to exclude all the owl and nrl properties but that would make
821 // for way to long queries and this should cover most cases anyway
822 // since we do not have inference we even need to check subclasses
823 #if SOPRANO_IS_VERSION(2,1,64)
824 finalQuery += QString(" AND NOT %1:%2 AND NOT %1:%3 AND NOT %1:%4 AND NOT %1:%5 AND NOT %1:%6 AND NOT %1:%7")
825 .arg( luceneQueryEscape(Soprano::Vocabulary::RDF::type()) )
826 .arg( luceneQueryEscape(Soprano::Vocabulary::RDF::Property()) )
827 .arg( luceneQueryEscape(Soprano::Vocabulary::RDFS::Class()) )
828 .arg( luceneQueryEscape(Soprano::Vocabulary::OWL::Class()) )
829 .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::InstanceBase()) )
830 .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::Ontology()) )
831 .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::KnowledgeBase()) );
832 #endif
834 kDebug() << finalQuery;
836 Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( finalQuery,
837 Soprano::Query::QueryLanguageUser,
838 "lucene" );
839 QHash<QUrl, Result> results;
841 while ( hits.next() ) {
842 if ( m_canceled ) break;
844 QUrl hitUri = hits.binding( 0 ).uri();
845 double hitScore = hits.binding( 1 ).literal().toDouble() * baseScore;
847 if ( hitScore >= cutOffScore() ) {
848 Result result( hitUri, hitScore );
850 if ( !m_searchTerm.requestProperties().isEmpty() ) {
851 // FIXME: when merging with results from sparqlQuery there is no need to fetch them twice!
852 fetchRequestPropertiesForResource( result );
855 // these are actual direct hits and we can report them right away
856 if ( reportResults ) {
857 if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
858 return results;
860 else {
861 ++m_numResults;
862 kDebug() << "direct hit:" << hitUri << hitScore;
863 emit newResult( result );
867 results.insert( hitUri, result );
869 else {
870 kDebug() << "Score too low:" << hitUri << hitScore;
874 return results;
878 QString Nepomuk::Search::SearchThread::buildRequestPropertyVariableList() const
880 int numRequestProperties = m_searchTerm.requestProperties().count();
881 QString s;
882 for ( int i = 1; i <= numRequestProperties; ++i ) {
883 s += QString( "?reqProp%1 " ).arg( i );
885 return s;
889 QString Nepomuk::Search::SearchThread::buildRequestPropertyPatterns() const
891 QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
892 QString s;
893 int i = 1;
894 foreach ( const Query::RequestProperty& rp, requestProperties ) {
895 if ( rp.second ) {
896 s += "OPTIONAL { ";
899 s += QString( "?r <%1> ?reqProp%2 . " ).arg( QString::fromAscii( rp.first.toEncoded() ) ).arg( i++ );
901 if ( rp.second ) {
902 s += "} ";
905 return s;
909 Nepomuk::Search::Result Nepomuk::Search::SearchThread::extractResult( const Soprano::QueryResultIterator& it ) const
911 Result result( it.binding( 0 ).uri() );
913 int i = 1;
914 QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
915 foreach ( const Query::RequestProperty& rp, requestProperties ) {
916 result.addRequestProperty( rp.first, it.binding( QString("reqProp%1").arg( i++ ) ) );
919 // score will be set above
920 return result;
924 void Nepomuk::Search::SearchThread::fetchRequestPropertiesForResource( Result& result )
926 QString q = QString( "select distinct %1 where { %2 }" )
927 .arg( buildRequestPropertyVariableList() )
928 .arg( buildRequestPropertyPatterns().replace( "?r ", '<' + QString::fromAscii( result.resourceUri().toEncoded() ) + "> " ) );
929 kDebug() << q;
930 Soprano::QueryResultIterator reqPropHits = ResourceManager::instance()->mainModel()->executeQuery( q, Soprano::Query::QueryLanguageSparql );
931 if ( reqPropHits.next() ) {
932 int i = 1;
933 QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
934 foreach ( const Query::RequestProperty& rp, requestProperties ) {
935 result.addRequestProperty( rp.first, reqPropHits.binding( QString("reqProp%1").arg( i++ ) ) );
940 #include "searchthread.moc"