runtime/nepomuk/services/queryservice/searchthread.cpp

   1 /*
   2   This file is part of the Nepomuk KDE project.
   3   Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
   4
   5   This library is free software; you can redistribute it and/or
   6   modify it under the terms of the GNU Library General Public
   7   License version 2 as published by the Free Software Foundation.
   8
   9   This library is distributed in the hope that it will be useful,
  10   but WITHOUT ANY WARRANTY; without even the implied warranty of
  11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12   Library General Public License for more details.
  13
  14   You should have received a copy of the GNU Library General Public License
  15   along with this library; see the file COPYING.LIB.  If not, write to
  16   the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  17   Boston, MA 02110-1301, USA.
  18 */
  19
  20 #include "searchthread.h"
  21 #include "term.h"
  22 #include "qurlhash.h"
  23
  24 #include <Nepomuk/ResourceManager>
  25 #include <Nepomuk/Resource>
  26 #include <Nepomuk/Types/Property>
  27 #include <Nepomuk/Types/Class>
  28 #include <Nepomuk/Types/Literal>
  29
  30 #include <Soprano/Version>
  31 #include <Soprano/Model>
  32 #include <Soprano/QueryResultIterator>
  33 #include <Soprano/Node>
  34 #include <Soprano/Statement>
  35 #include <Soprano/LiteralValue>
  36 #include <Soprano/StatementIterator>
  37 #include <Soprano/Vocabulary/RDF>
  38 #include <Soprano/Vocabulary/RDFS>
  39 #include <Soprano/Vocabulary/NRL>
  40 #include <Soprano/Vocabulary/NAO>
  41 #include <Soprano/Vocabulary/XMLSchema>
  42 #include <Soprano/Vocabulary/OWL>
  43
  44 #include <KDebug>
  45
  46 #include <QtCore/QTime>
  47
  48
  49
  50 // FIXME: With our cutoff score we might miss results that are hit multiple times and thus, would get their
  51 //        score increased
  52
  53 #warning Make query optimization methods return an invalid term if the query cannot be resolved and handle this as no results
  54
  55 using namespace Soprano;
  56
  57 namespace {
  58     /**
  59      * The maximum number of resources that are matched in resolveValues when converting
  60      * an equals or contains term.
  61      */
  62     const int MAX_RESOURCES = 4;
  63
  64
  65     void mergeInResult( QHash<QUrl, Nepomuk::Search::Result>& results, const Nepomuk::Search::Result& resource ) {
  66         QHash<QUrl, Nepomuk::Search::Result>::iterator old = results.find( resource.resourceUri() );
  67         if ( old == results.end() ) {
  68             results.insert( resource.resourceUri(), resource );
  69         }
  70         else {
  71             // FIXME: how do we join the scores properly? Is adding a good idea? It can certainly not be multiplication!
  72             Nepomuk::Search::Result& result = *old;
  73             result.setScore( result.score() + resource.score() );
  74         }
  75     }
  76
  77     void mergeInResults( QHash<QUrl, Nepomuk::Search::Result>& results, const QHash<QUrl, Nepomuk::Search::Result>& otherResults ) {
  78         for ( QHash<QUrl, Nepomuk::Search::Result>::const_iterator it = otherResults.constBegin();
  79               it != otherResults.constEnd(); ++it ) {
  80             mergeInResult( results, it.value() );
  81         }
  82     }
  83
  84     // This is a copy of Soprano::Index::IndexFilterModel::encodeStringForLuceneQuery
  85     // which we do not use to prevent linking to sopranoindex
  86     QString luceneQueryEscape( const QString& s ) {
  87         /* Chars to escape: + - && || ! ( ) { } [ ] ^ " ~  : \ */
  88
  89         static QRegExp rx( "([\\-" + QRegExp::escape( "+&|!(){}[]^\"~:\\" ) + "])" );
  90         QString es( s );
  91         es.replace( rx, "\\\\1" );
  92         return es;
  93     }
  94
  95     QString luceneQueryEscape( const QUrl& s ) {
  96         return luceneQueryEscape( QString::fromAscii( s.toEncoded() ) );
  97     }
  98
  99     QString createLuceneLiteralQuery( const QString& escaped ) {
 100         if ( escaped.contains( QRegExp( "\\s" ) ) ) {
 101             return "\"" + escaped + "\"";
 102         }
 103         else {
 104             return escaped;
 105         }
 106     }
 107
 108     QString createLuceneQuery( const Nepomuk::Search::SearchNode& node ) {
 109         if ( node.term.type() == Nepomuk::Search::Term::LiteralTerm ) {
 110             return createLuceneLiteralQuery( luceneQueryEscape( node.term.value().toString() ) );
 111         }
 112         else if ( node.term.type() == Nepomuk::Search::Term::ComparisonTerm ) {
 113             return luceneQueryEscape( node.term.property() ) + ':' + createLuceneLiteralQuery( luceneQueryEscape( node.term.subTerms().first().value().toString() ) );
 114         }
 115         else {
 116             Q_ASSERT( node.term.type() == Nepomuk::Search::Term::AndTerm ||
 117                       node.term.type() == Nepomuk::Search::Term::OrTerm );
 118
 119             QStringList sq;
 120             foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
 121                 sq += createLuceneQuery( n );
 122             }
 123             if ( node.term.type() == Nepomuk::Search::Term::AndTerm ) {
 124                 return " ( " + sq.join( " AND " ) + " ) ";
 125             }
 126             else {
 127                 return " ( " + sq.join( " OR " ) + " ) ";
 128             }
 129         }
 130     }
 131
 132     QString comparatorString( Nepomuk::Search::Term::Comparator c ) {
 133         switch( c ) {
 134         case Nepomuk::Search::Term::Contains:
 135             return ":";
 136         case Nepomuk::Search::Term::Equal:
 137             return "=";
 138         case Nepomuk::Search::Term::Greater:
 139             return ">";
 140         case Nepomuk::Search::Term::Smaller:
 141             return "<";
 142         case Nepomuk::Search::Term::GreaterOrEqual:
 143             return ">=";
 144         case Nepomuk::Search::Term::SmallerOrEqual:
 145             return "<=";
 146         }
 147         // make gcc happy
 148         return QString();
 149     }
 150
 151
 152     bool isNumberLiteralValue( const Soprano::LiteralValue& value ) {
 153         return value.isInt() || value.isInt64() || value.isUnsignedInt() || value.isUnsignedInt64() || value.isDouble();
 154     }
 155
 156
 157     QString createGraphPattern( const Nepomuk::Search::SearchNode& node, int& varCnt, const QString& varName = QString( "?r" ) )
 158     {
 159         switch( node.term.type() ) {
 160         case Nepomuk::Search::Term::ComparisonTerm: {
 161
 162             Nepomuk::Search::Term subTerm( node.term.subTerms().first() );
 163
 164             //
 165             // is the subterm (we only support one ATM) a final term (no further subterms)
 166             // -> actually match the literal or resource
 167             //
 168             if ( subTerm.type() == Nepomuk::Search::Term::ResourceTerm ||
 169                  subTerm.type() == Nepomuk::Search::Term::LiteralTerm ) {
 170                 if( node.term.comparator() != Nepomuk::Search::Term::Equal ) {
 171                     // For numbers there is no need for quotes + this way we can handle all the xsd decimal types
 172                     // FIXME: it may be necessary to escape stuff
 173                     QString filter = QString( "?var%1 %2 " )
 174                                      .arg( ++varCnt )
 175                                      .arg( comparatorString( node.term.comparator() ) );
 176                     if ( isNumberLiteralValue( subTerm.value() ) ) {
 177                         filter += subTerm.value().toString();
 178                     }
 179                     else {
 180                         Nepomuk::Types::Property prop( node.term.property() );
 181                         filter += QString( "\"%1\"" ).arg( subTerm.value().toString() );
 182                         if ( prop.literalRangeType().dataTypeUri().isValid() )
 183                             filter += QString( "^^<%1>" ).arg( prop.literalRangeType().dataTypeUri().toString() );
 184                     }
 185
 186                     return QString( "%1 <%2> ?var%3 . FILTER(%4) . " )
 187                         .arg( varName )
 188                         .arg( QString::fromAscii( node.term.property().toEncoded() ) )
 189                         .arg( varCnt )
 190                         .arg( filter );
 191                 }
 192                 else {
 193                     if ( subTerm.type() == Nepomuk::Search::Term::ResourceTerm ) {
 194                         return QString( "%1 <%2> <%3> . " )
 195                             .arg( varName )
 196                             .arg( QString::fromAscii( node.term.property().toEncoded() ) )
 197                             .arg( QString::fromAscii( subTerm.resource().toEncoded() ) );
 198                     }
 199                     else if ( Nepomuk::Types::Property( node.term.property() ).range().isValid() ) {
 200                         return QString( "%7 <%1> ?x . { ?x <%2> \"%3\"^^<%4> . } UNION { ?x <%5> \"%3\"^^<%4>.  } UNION { ?x <%6> \"%3\"^^<%4> . }" )
 201                             .arg( QString::fromAscii( node.term.property().toEncoded() ) )
 202                             .arg( Soprano::Vocabulary::RDFS::label().toString() )
 203                             .arg( subTerm.value().toString() )
 204                             .arg( Soprano::Vocabulary::XMLSchema::string().toString() )
 205                             .arg( Soprano::Vocabulary::NAO::prefLabel().toString() )
 206                             .arg( Soprano::Vocabulary::NAO::identifier().toString() )
 207                             .arg( varName );
 208                     }
 209                     else {
 210                         return QString( "%1 <%2> \"%3\"^^<%4> . " )
 211                             .arg( varName )
 212                             .arg( QString::fromAscii( node.term.property().toEncoded() ) )
 213                             .arg( subTerm.value().toString() )
 214                             .arg( Nepomuk::Types::Property( node.term.property() ).literalRangeType().dataTypeUri().toString() );
 215                     }
 216                 }
 217             }
 218
 219             //
 220             // Is the subterm not final, i.e. has further subterms
 221             // -> combine graph pattern with subterm graph pattern
 222             //
 223             else {
 224                 QString bridgeVarName = QString( "?var%1" ).arg( ++varCnt );
 225                 return QString( "%1 <%2> %3 . " )
 226                     .arg( varName )
 227                     .arg( QString::fromAscii( node.term.property().toEncoded() ) )
 228                     .arg( bridgeVarName )
 229                     + createGraphPattern( node.subNodes.first(), varCnt, bridgeVarName );
 230             }
 231         }
 232
 233         case Nepomuk::Search::Term::AndTerm: {
 234             QString s( "{ " );
 235             foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
 236                 s += createGraphPattern( n, varCnt );
 237             }
 238             s += "} ";
 239             return s;
 240         }
 241
 242         case Nepomuk::Search::Term::OrTerm: {
 243             QStringList s;
 244             foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
 245                 s += createGraphPattern( n, varCnt );
 246             }
 247             Q_ASSERT( !s.isEmpty() );
 248             return "{ " + s.join( " } UNION { " ) + " } ";
 249         }
 250
 251         default:
 252             Q_ASSERT_X( 0, "createGraphPattern", "unsupported Term type" );
 253         }
 254
 255         return QString();
 256     }
 257 }
 258
 259
 260 Nepomuk::Search::SearchThread::SearchThread( QObject* parent )
 261     : QThread( parent )
 262 {
 263 }
 264
 265
 266 Nepomuk::Search::SearchThread::~SearchThread()
 267 {
 268 }
 269
 270
 271 void Nepomuk::Search::SearchThread::query( const Query& term, double cutOffScore )
 272 {
 273     if( isRunning() ) {
 274         cancel();
 275     }
 276
 277     kDebug() << term << cutOffScore;
 278
 279     m_canceled = false;
 280     m_searchTerm = term;
 281     m_cutOffScore = cutOffScore;
 282     m_numResults = 0;
 283
 284     start();
 285 }
 286
 287
 288 void Nepomuk::Search::SearchThread::cancel()
 289 {
 290     m_canceled = true;
 291     wait();
 292 }
 293
 294
 295 void Nepomuk::Search::SearchThread::run()
 296 {
 297     QTime time;
 298     time.start();
 299
 300     if ( m_searchTerm.type() == Query::PlainQuery ) {
 301         kDebug() << "Plain Query:    " << m_searchTerm;
 302         Term t = resolveFields( m_searchTerm.term() );
 303         kDebug() << "Fields resolved:" << t;
 304         t = resolveValues( t );
 305         kDebug() << "Values resolved:" << t;
 306         t = optimize( t );
 307         kDebug() << "Optimized query:" << t;
 308
 309         search( splitLuceneSparql( t ) /*optimize( resolveValues( resolveFields( m_searchTerm ) ) )*/, 1.0, true );
 310     }
 311     else {
 312         // FIXME: once we have the Soprano query API it should be simple to add the requestProperties here
 313         // for now we do it the hacky way
 314         QString query = m_searchTerm.sparqlQuery();
 315         int pos = query.indexOf( QLatin1String( "where" ) );
 316         if ( pos > 0 ) {
 317             query.insert( pos, buildRequestPropertyVariableList() + ' ' );
 318             pos = query.lastIndexOf( '}' );
 319             if ( pos > 0 ) {
 320                 query.insert( pos, ' ' + buildRequestPropertyPatterns() + ' ' );
 321             }
 322         }
 323
 324         sparqlQuery( query, 1.0, true );
 325     }
 326
 327     kDebug() << time.elapsed();
 328 }
 329
 330
 331 Nepomuk::Search::Term Nepomuk::Search::SearchThread::resolveFields( const Term& term )
 332 {
 333     switch( term.type() ) {
 334     case Term::AndTerm:
 335     case Term::OrTerm: {
 336         Term newTerm;
 337         newTerm.setType( term.type() );
 338         QList<Term> terms = term.subTerms();
 339         foreach( const Term& t, terms ) {
 340             if ( m_canceled ) break;
 341             newTerm.addSubTerm( resolveFields( t ) );
 342         }
 343         return newTerm;
 344     }
 345
 346
 347     case Term::ComparisonTerm: {
 348         Term newTerm( term );
 349         Term subTerm = term.subTerms().first();
 350         if ( subTerm.type() != Term::LiteralTerm &&
 351              subTerm.type() != Term::ResourceTerm ) {
 352             newTerm.setSubTerms( QList<Term>() << resolveFields( subTerm ) );
 353         }
 354
 355         if ( !newTerm.property().isValid() ) {
 356             // FIXME: use the score of the field search as boost factors
 357             QList<QUrl> properties = matchFieldName( term.field() );
 358             if ( properties.count() > 0 ) {
 359                 if ( properties.count() == 1 ) {
 360                     newTerm.setProperty( properties.first() );
 361                     return newTerm;
 362                 }
 363                 else {
 364                     Term orTerm;
 365                     orTerm.setType( Term::OrTerm );
 366                     foreach( const QUrl& property, properties ) {
 367                         Term t( newTerm );
 368                         t.setProperty( property );
 369                         orTerm.addSubTerm( t );
 370                     }
 371                     return orTerm;
 372                 }
 373             }
 374             else {
 375                 kDebug() << "Failed to resolve field" << term.field() << "to any property!";
 376                 return Term();
 377             }
 378         }
 379     }
 380
 381     default:
 382         return term;
 383     }
 384 }
 385
 386
 387 // precondition: resolveFields needs to be run before this one as it only touches properties
 388 Nepomuk::Search::Term Nepomuk::Search::SearchThread::resolveValues( const Term& term )
 389 {
 390     switch( term.type() ) {
 391     case Term::AndTerm:
 392     case Term::OrTerm: {
 393         Term newTerm;
 394         newTerm.setType( term.type() );
 395         QList<Term> terms = term.subTerms();
 396         foreach( const Term& t, terms ) {
 397             if ( m_canceled ) break;
 398             newTerm.addSubTerm( resolveValues( t ) );
 399         }
 400         return newTerm;
 401     }
 402
 403
 404     case Term::ComparisonTerm: {
 405         // FIXME: we could also handle this via lucene for literals but what is better?
 406         // with lucene we have the additional work of getting the requestProperties
 407
 408         // FIXME: handle subqueries
 409
 410         //
 411         // ComparisonTerm Terms can contain subterms that again. We do not support
 412         // arbitrary subterms but only comparator terms. Here we will only resolve the
 413         // last one since all others will be handled in a single SPARQL query.
 414         //
 415         // Also, non-comtains comparators are handled in the SPARQL query as well.
 416         //
 417         // Thus, in the end we only resolve literal contains terms.
 418         //
 419         if ( term.comparator() == Term::Contains &&
 420              term.subTerms().first().type() == Term::LiteralTerm ) {
 421
 422             Q_ASSERT ( term.property().isValid() );
 423
 424             // we only need to augment terms that have a property with
 425             // a non-literal range. These will never hit in a lucene query
 426             // anyway
 427             Nepomuk::Types::Property prop( term.property() );
 428             if ( prop.range().isValid() ) {
 429
 430                 Term orTerm;
 431                 orTerm.setType( Term::OrTerm );
 432
 433                 // FIXME: cache the results as it is very well possible that we search the same multiple times
 434                 // if resolveFields did create an OR term
 435
 436                 // rdfs:label has a higher priority than any other property
 437                 // TODO: without being able to query the resource type simple searching for term.value() is waaaaay to slow
 438                 //QString query = QString( "%1:\"%2\"^4 \"%2\"" )
 439                 QString query = QString( "%1:\"%2\" OR %3:\"%2\" OR %4:\"%2\"" )
 440                                 .arg( luceneQueryEscape( Soprano::Vocabulary::RDFS::label() ) )
 441                                 .arg( term.subTerms().first().value().toString() )
 442                                 .arg( luceneQueryEscape( Soprano::Vocabulary::NAO::prefLabel() ) )
 443                                 .arg( luceneQueryEscape( Soprano::Vocabulary::NAO::identifier() ) );
 444                 Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( query,
 445                                                                                                             Soprano::Query::QueryLanguageUser,
 446                                                                                                             "lucene" );
 447
 448                 while ( hits.next() ) {
 449                     if ( m_canceled ) break;
 450
 451                     // FIXME: use the lucene score as boost factor
 452                     QUrl hit = hits.binding( 0 ).uri();
 453                     if ( prop.range().uri() == Soprano::Vocabulary::RDFS::Resource() ||
 454                          Nepomuk::Resource( hit ).hasType( prop.range().uri() ) ) {
 455                         orTerm.addSubTerm( Term( term.property(), hit ) );
 456                         if ( orTerm.subTerms().count() == MAX_RESOURCES ) {
 457                             break;
 458                         }
 459                     }
 460                 }
 461
 462                 if ( orTerm.subTerms().count() == 1 ) {
 463                     return orTerm.subTerms().first();
 464                 }
 465                 else if ( orTerm.subTerms().count() ) {
 466                     return orTerm;
 467                 }
 468                 else {
 469                     kDebug() << "Failed to match value" << term.subTerms().first().value() << "to any possible resource.";
 470                     return term;
 471                 }
 472             }
 473             else {
 474                 // nothing to do here
 475                 return term;
 476             }
 477         }
 478
 479         // non-literal term or non-contains term -> handled in SPARQL query
 480         else {
 481             Term newTerm( term );
 482             newTerm.setSubTerms( QList<Term>() << resolveValues( term.subTerms().first() ) );
 483             return newTerm;
 484         }
 485     }
 486
 487     default:
 488         return term;
 489     }
 490 }
 491
 492
 493 Nepomuk::Search::Term Nepomuk::Search::SearchThread::optimize( const Term& term )
 494 {
 495     switch( term.type() ) {
 496     case Term::AndTerm:
 497     case Term::OrTerm: {
 498         QList<Term> subTerms = term.subTerms();
 499         QList<Term> newSubTerms;
 500         QList<Term>::const_iterator end( subTerms.constEnd() );
 501         for ( QList<Term>::const_iterator it = subTerms.constBegin();
 502               it != end; ++it ) {
 503             const Term& t = *it;
 504             Term ot = optimize( t );
 505             if ( ot.type() == term.type() ) {
 506                 newSubTerms += ot.subTerms();
 507             }
 508             else {
 509                 newSubTerms += ot;
 510             }
 511         }
 512         Term newTerm;
 513         newTerm.setType( term.type() );
 514         newTerm.setSubTerms( newSubTerms );
 515         return newTerm;
 516     }
 517
 518     default:
 519         return term;
 520     }
 521 }
 522
 523
 524 Nepomuk::Search::SearchNode Nepomuk::Search::SearchThread::splitLuceneSparql( const Term& term )
 525 {
 526     // Goal: separate the terms into 2 groups: literal and resource which are
 527     // merged with only one AND or OR action. Is that possible?
 528
 529     // For now we will do this (our query lang does not handle nested queries anyway)
 530     // LiteralTerm    -> one lucene, no sparql
 531     // ComparisonTerm -> one lucene, no sparql (resource contains will be resolved to equality above)
 532     // AndTerm        -> divide all subterms and create two "small" AND terms
 533     // OrTerm         -> divide all subterms and create two "small" OR terms
 534
 535     switch( term.type() ) {
 536     case Term::LiteralTerm:
 537         return SearchNode( term, SearchNode::Lucene );
 538
 539     case Term::ComparisonTerm:
 540         if ( term.comparator() == Term::Contains &&
 541              term.subTerms().first().type() == Term::LiteralTerm ) {
 542             // no need for subnides here - we only use the subterm's value
 543             return SearchNode( term, SearchNode::Lucene );
 544         }
 545         else {
 546             // all subnodes are resolved and can be handled in a SPARQL query
 547             SearchNode node( term, SearchNode::Sparql );
 548             node.subNodes += splitLuceneSparql( term.subTerms().first() );
 549             return node;
 550         }
 551
 552     case Term::AndTerm:
 553     case Term::OrTerm: {
 554         QList<Term> subTerms = term.subTerms();
 555         QList<SearchNode> luceneNodes, sparqlNodes, unknownNodes;
 556
 557         QList<Term>::const_iterator end( subTerms.constEnd() );
 558         for ( QList<Term>::const_iterator it = subTerms.constBegin();
 559               it != end; ++it ) {
 560             SearchNode node = splitLuceneSparql( *it );
 561             if ( node.type == SearchNode::Lucene ) {
 562                 luceneNodes += node;
 563             }
 564             else if ( node.type == SearchNode::Sparql ) {
 565                 sparqlNodes += node;
 566             }
 567             else {
 568                 unknownNodes += node;
 569             }
 570         }
 571
 572         if ( luceneNodes.count() && !sparqlNodes.count() && !unknownNodes.count() ) {
 573             return SearchNode( term, SearchNode::Lucene, luceneNodes );
 574         }
 575         else if ( !luceneNodes.count() && sparqlNodes.count() && !unknownNodes.count() ) {
 576             return SearchNode( term, SearchNode::Sparql, sparqlNodes );
 577         }
 578         else if ( !luceneNodes.count() && !sparqlNodes.count() && unknownNodes.count() ) {
 579             return SearchNode( term, SearchNode::Unknown, unknownNodes );
 580         }
 581         else {
 582             Term newTerm;
 583             newTerm.setType( term.type() );
 584             SearchNode andNode( newTerm );
 585             if ( luceneNodes.count() )
 586                 andNode.subNodes += SearchNode( term, SearchNode::Lucene, luceneNodes );
 587             if ( sparqlNodes.count() )
 588                 andNode.subNodes += SearchNode( term, SearchNode::Sparql, sparqlNodes );
 589             if ( unknownNodes.count() )
 590                 andNode.subNodes += SearchNode( term, SearchNode::Unknown, unknownNodes );
 591             return andNode;
 592         }
 593     }
 594
 595     default:
 596 //        Q_ASSERT_X( 0, "splitLuceneSparql", "invalid term" );
 597         return SearchNode( Term() );
 598     }
 599 }
 600
 601
 602 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::search( const SearchNode& node, double baseScore, bool reportResults )
 603 {
 604     if ( node.type == SearchNode::Lucene ) {
 605         return luceneQuery( createLuceneQuery( node ), baseScore, reportResults );
 606     }
 607     else if ( node.type == SearchNode::Sparql ) {
 608         return sparqlQuery( createSparqlQuery( node ), baseScore, reportResults );
 609     }
 610     else if ( node.term.type() == Term::AndTerm ) {
 611         return andSearch( node.subNodes, baseScore, reportResults );
 612     }
 613     else {
 614         return orSearch( node.subNodes, baseScore, reportResults );
 615     }
 616 }
 617
 618
 619 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::andSearch( const QList<SearchNode>& nodes, double baseScore, bool reportResults )
 620 {
 621     QHash<QUrl, Result> results;
 622     bool first = true;
 623     foreach( const SearchNode& node, nodes ) {
 624         if ( m_canceled ) break;
 625         // FIXME: the search will restrict the number of results to maxResults although
 626         //        after the merge we might have less
 627         QHash<QUrl, Result> termResults = search( node, baseScore, false );
 628         if ( first ) {
 629             results = termResults;
 630             first = false;
 631         }
 632         else {
 633             // intersect the results
 634             // FIXME: sort by score
 635             QHash<QUrl, Result>::iterator it = results.begin();
 636             while ( it != results.end() ) {
 637                 if ( m_canceled ) break;
 638                 QHash<QUrl, Result>::const_iterator termIt = termResults.constFind( it.key() );
 639                 if ( termIt != termResults.constEnd() ) {
 640                     // update score
 641                     it.value().setScore( it.value().score() + termIt.value().score() );
 642                     ++it;
 643                 }
 644                 else {
 645                     it = results.erase( it );
 646                 }
 647             }
 648         }
 649     }
 650
 651     if ( reportResults ) {
 652         for ( QHash<QUrl, Result>::const_iterator it = results.constBegin();
 653               it != results.constEnd(); ++it ) {
 654             if ( m_canceled ) break;
 655             if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
 656                 return results;
 657             }
 658             else {
 659                 ++m_numResults;
 660                 emit newResult( it.value() );
 661             }
 662         }
 663     }
 664
 665     return results;
 666 }
 667
 668
 669 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::orSearch( const QList<SearchNode>& nodes, double baseScore, bool reportResults )
 670 {
 671     QHash<QUrl, Result> results;
 672     foreach( const SearchNode& node, nodes ) {
 673         if ( m_canceled ) break;
 674         // FIXME: sort by score, ie. use the maxResults results with the highest score
 675         mergeInResults( results, search( node, baseScore, reportResults ) );
 676     }
 677     if ( reportResults ) {
 678         for ( QHash<QUrl, Result>::const_iterator it = results.constBegin();
 679               it != results.constEnd(); ++it ) {
 680             if ( m_canceled ) break;
 681             if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
 682                 return results;
 683             }
 684             else {
 685                 ++m_numResults;
 686                 emit newResult( it.value() );
 687             }
 688         }
 689     }
 690     return results;
 691 }
 692
 693
 694 QList<QUrl> Nepomuk::Search::SearchThread::matchFieldName( const QString& field )
 695 {
 696     kDebug() << field;
 697
 698     QList<QUrl> results;
 699
 700     // Step 1: see if we have a direct match to a predicate label
 701     //         there is no need in selecting unused properties
 702     QString query = QString( "select distinct ?p where { "
 703                              "?p <%1> <%2> . "
 704                              "?p <%3> \"%4\"^^<%5> . "
 705                              "?x ?p ?y . }" )
 706                     .arg( Soprano::Vocabulary::RDF::type().toString() )
 707                     .arg( Soprano::Vocabulary::RDF::Property().toString() )
 708                     .arg( Soprano::Vocabulary::RDFS::label().toString() )
 709                     .arg( field )
 710                     .arg( Soprano::Vocabulary::XMLSchema::string().toString() );
 711     kDebug() << "Direct match query:" << query;
 712
 713     Soprano::QueryResultIterator labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
 714                                                                                                      Soprano::Query::QueryLanguageSparql );
 715     if ( !m_canceled ) {
 716         while ( labelHits.next() ) {
 717             results << labelHits.binding( "p" ).uri();
 718             kDebug() << "Found direct match" << labelHits.binding( "p" ).uri();
 719         }
 720
 721         if ( results.isEmpty() ) {
 722             // FIXME: how about we have two repositories: one for the ontologies and one for the data.
 723             //        I don't think there will be relations between the RDF or Xesam ontology and some
 724             //        metadata....
 725             //        Because then queries like the one we are doing here will be more performant since
 726             //        we do not search the data itself and do not have to filter
 727             // BUT: What about inference?
 728
 729             query = QString( "select ?p where { "
 730                              "?p <%1> <%2> . "
 731                              "?p <%3> ?label . "
 732                              "FILTER(REGEX(STR(?label),'%4','i')) . }" )
 733                     .arg( Soprano::Vocabulary::RDF::type().toString() )
 734                     .arg( Soprano::Vocabulary::RDF::Property().toString() )
 735                     .arg( Soprano::Vocabulary::RDFS::label().toString() )
 736                     .arg( field );
 737             kDebug() << "Indirect hit query:" << query;
 738             labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
 739                                                                                 Soprano::Query::QueryLanguageSparql );
 740             QString newQuery;
 741             while ( labelHits.next() ) {
 742                 results << labelHits.binding( "p" ).uri();
 743                 kDebug() << "Found indirect match by label" << labelHits.binding( "p" ).uri();
 744             }
 745         }
 746
 747
 748         if ( results.isEmpty() ) {
 749             query = QString( "select ?p where { "
 750                              "?p <%1> <%2> . "
 751                              "FILTER(REGEX(STR(?p),'%3','i')) . }" )
 752                     .arg( Soprano::Vocabulary::RDF::type().toString() )
 753                     .arg( Soprano::Vocabulary::RDF::Property().toString() )
 754                     .arg( field );
 755             kDebug() << "Indirect hit query:" << query;
 756             labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
 757                                                                                 Soprano::Query::QueryLanguageSparql );
 758             QString newQuery;
 759             while ( labelHits.next() ) {
 760                 results << labelHits.binding( "p" ).uri();
 761                 kDebug() << "Found indirect match by name" << labelHits.binding( "p" ).uri();
 762             }
 763         }
 764     }
 765
 766     return results;
 767 }
 768
 769
 770 QString Nepomuk::Search::SearchThread::createSparqlQuery( const Nepomuk::Search::SearchNode& node )
 771 {
 772     int varCnt = 0;
 773     return QString( "select distinct ?r %1 where { graph ?g { ?r a ?type . } . ?g a <%2> . %3 %4 }" )
 774         .arg( buildRequestPropertyVariableList() )
 775         .arg( Soprano::Vocabulary::NRL::InstanceBase().toString() )
 776         .arg( createGraphPattern( node, varCnt ) )
 777         .arg( buildRequestPropertyPatterns() );
 778 }
 779
 780
 781 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::sparqlQuery( const QString& query, double baseScore, bool reportResults )
 782 {
 783     kDebug() << query;
 784
 785     QHash<QUrl, Result> results;
 786
 787     Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( query, Soprano::Query::QueryLanguageSparql );
 788     while ( hits.next() ) {
 789         if ( m_canceled ) break;
 790
 791         Result result = extractResult( hits );
 792         result.setScore( baseScore );
 793
 794         kDebug() << "Found result:" << result.resourceUri();
 795
 796         // these are actual direct hits and we can report them right away
 797         if ( reportResults ) {
 798             if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
 799                 return results;
 800             }
 801             else {
 802                 ++m_numResults;
 803                 emit newResult( result );
 804             }
 805         }
 806
 807         results.insert( result.resourceUri(), result );
 808     }
 809
 810     return results;
 811 }
 812
 813
 814 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::luceneQuery( const QString& query, double baseScore, bool reportResults )
 815 {
 816     QString finalQuery( query );
 817
 818     // if Soprano is 2.1.64 or newer the storage service does force the indexing or rdf:type which means that
 819     // we can query it via lucene queries
 820     // normally for completeness we would have to exclude all the owl and nrl properties but that would make
 821     // for way to long queries and this should cover most cases anyway
 822     // since we do not have inference we even need to check subclasses
 823 #if SOPRANO_IS_VERSION(2,1,64)
 824     finalQuery += QString(" AND NOT %1:%2 AND NOT %1:%3 AND NOT %1:%4 AND NOT %1:%5 AND NOT %1:%6 AND NOT %1:%7")
 825                   .arg( luceneQueryEscape(Soprano::Vocabulary::RDF::type()) )
 826                   .arg( luceneQueryEscape(Soprano::Vocabulary::RDF::Property()) )
 827                   .arg( luceneQueryEscape(Soprano::Vocabulary::RDFS::Class()) )
 828                   .arg( luceneQueryEscape(Soprano::Vocabulary::OWL::Class()) )
 829                   .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::InstanceBase()) )
 830                   .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::Ontology()) )
 831                   .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::KnowledgeBase()) );
 832 #endif
 833
 834     kDebug() << finalQuery;
 835
 836     Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( finalQuery,
 837                                                                                                 Soprano::Query::QueryLanguageUser,
 838                                                                                                 "lucene" );
 839     QHash<QUrl, Result> results;
 840
 841     while ( hits.next() ) {
 842         if ( m_canceled ) break;
 843
 844         QUrl hitUri = hits.binding( 0 ).uri();
 845         double hitScore = hits.binding( 1 ).literal().toDouble() * baseScore;
 846
 847         if ( hitScore >= cutOffScore() ) {
 848             Result result( hitUri, hitScore );
 849
 850             if ( !m_searchTerm.requestProperties().isEmpty() ) {
 851                 // FIXME: when merging with results from sparqlQuery there is no need to fetch them twice!
 852                 fetchRequestPropertiesForResource( result );
 853             }
 854
 855             // these are actual direct hits and we can report them right away
 856             if ( reportResults ) {
 857                 if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
 858                     return results;
 859                 }
 860                 else {
 861                     ++m_numResults;
 862                     kDebug() << "direct hit:" << hitUri << hitScore;
 863                     emit newResult( result );
 864                 }
 865             }
 866
 867             results.insert( hitUri, result );
 868         }
 869         else {
 870             kDebug() << "Score too low:" << hitUri << hitScore;
 871         }
 872     }
 873
 874     return results;
 875 }
 876
 877
 878 QString Nepomuk::Search::SearchThread::buildRequestPropertyVariableList() const
 879 {
 880     int numRequestProperties = m_searchTerm.requestProperties().count();
 881     QString s;
 882     for ( int i = 1; i <= numRequestProperties; ++i ) {
 883         s += QString( "?reqProp%1 " ).arg( i );
 884     }
 885     return s;
 886 }
 887
 888
 889 QString Nepomuk::Search::SearchThread::buildRequestPropertyPatterns() const
 890 {
 891     QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
 892     QString s;
 893     int i = 1;
 894     foreach ( const Query::RequestProperty& rp, requestProperties ) {
 895         if ( rp.second ) {
 896             s += "OPTIONAL { ";
 897         }
 898
 899         s += QString( "?r <%1> ?reqProp%2 . " ).arg( QString::fromAscii( rp.first.toEncoded() ) ).arg( i++ );
 900
 901         if ( rp.second ) {
 902             s += "} ";
 903         }
 904     }
 905     return s;
 906 }
 907
 908
 909 Nepomuk::Search::Result Nepomuk::Search::SearchThread::extractResult( const Soprano::QueryResultIterator& it ) const
 910 {
 911     Result result( it.binding( 0 ).uri() );
 912
 913     int i = 1;
 914     QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
 915     foreach ( const Query::RequestProperty& rp, requestProperties ) {
 916         result.addRequestProperty( rp.first, it.binding( QString("reqProp%1").arg( i++ ) ) );
 917     }
 918
 919     // score will be set above
 920     return result;
 921 }
 922
 923
 924 void Nepomuk::Search::SearchThread::fetchRequestPropertiesForResource( Result& result )
 925 {
 926     QString q = QString( "select distinct %1 where { %2 }" )
 927                 .arg( buildRequestPropertyVariableList() )
 928                 .arg( buildRequestPropertyPatterns().replace( "?r ", '<' + QString::fromAscii( result.resourceUri().toEncoded() ) + "> " ) );
 929     kDebug() << q;
 930     Soprano::QueryResultIterator reqPropHits = ResourceManager::instance()->mainModel()->executeQuery( q, Soprano::Query::QueryLanguageSparql );
 931     if ( reqPropHits.next() ) {
 932         int i = 1;
 933         QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
 934         foreach ( const Query::RequestProperty& rp, requestProperties ) {
 935             result.addRequestProperty( rp.first, reqPropHits.binding( QString("reqProp%1").arg( i++ ) ) );
 936         }
 937     }
 938 }
 939
 940 #include "searchthread.moc"