includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Contain a class for special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $prefix = '';
  17         var $searchTerms = array();
  18         var $namespaces = array( NS_MAIN );
  19         var $showRedirects = false;
  20
  21         /**
  22          * Perform a full text search query and return a result set.
  23          * If title searches are not supported or disabled, return null.
  24          * STUB
  25          *
  26          * @param $term String: raw search term
  27          * @return SearchResultSet
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          * STUB
  37          *
  38          * @param $term String: raw search term
  39          * @return SearchResultSet
  40          */
  41         function searchTitle( $term ) {
  42                 return null;
  43         }
  44
  45         /** If this search backend can list/unlist redirects */
  46         function acceptListRedirects() {
  47                 return true;
  48         }
  49
  50         /**
  51          * Transform search term in cases when parts of the query came as different GET params (when supported)
  52          * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
  53          */
  54         function transformSearchTerm( $term ) {
  55                 return $term;
  56         }
  57
  58         /**
  59          * If an exact title match can be find, or a very slightly close match,
  60          * return the title. If no match, returns NULL.
  61          *
  62          * @param $searchterm String
  63          * @return Title
  64          */
  65         public static function getNearMatch( $searchterm ) {
  66                 global $wgContLang;
  67
  68                 $allSearchTerms = array($searchterm);
  69
  70                 if($wgContLang->hasVariants()){
  71                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  72                 }
  73
  74                 foreach($allSearchTerms as $term){
  75
  76                         # Exact match? No need to look further.
  77                         $title = Title::newFromText( $term );
  78                         if (is_null($title))
  79                                 return NULL;
  80
  81                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
  82                                 return $title;
  83                         }
  84
  85                         # See if it still otherwise has content is some sane sense
  86                         $article = MediaWiki::articleFromTitle( $title );
  87                         if( $article->hasViewableContent() ) {
  88                                 return $title;
  89                         }
  90
  91                         # Now try all lower case (i.e. first letter capitalized)
  92                         #
  93                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  94                         if ( $title && $title->exists() ) {
  95                                 return $title;
  96                         }
  97
  98                         # Now try capitalized string
  99                         #
 100                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
 101                         if ( $title && $title->exists() ) {
 102                                 return $title;
 103                         }
 104
 105                         # Now try all upper case
 106                         #
 107                         $title = Title::newFromText( $wgContLang->uc( $term ) );
 108                         if ( $title && $title->exists() ) {
 109                                 return $title;
 110                         }
 111
 112                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 113                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
 114                         if ( $title && $title->exists() ) {
 115                                 return $title;
 116                         }
 117
 118                         // Give hooks a chance at better match variants
 119                         $title = null;
 120                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 121                                 return $title;
 122                         }
 123                 }
 124
 125                 $title = Title::newFromText( $searchterm );
 126
 127                 # Entering an IP address goes to the contributions page
 128                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 129                         || User::isIP( trim( $searchterm ) ) ) {
 130                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 131                 }
 132
 133
 134                 # Entering a user goes to the user page whether it's there or not
 135                 if ( $title->getNamespace() == NS_USER ) {
 136                         return $title;
 137                 }
 138
 139                 # Go to images that exist even if there's no local page.
 140                 # There may have been a funny upload, or it may be on a shared
 141                 # file repository such as Wikimedia Commons.
 142                 if( $title->getNamespace() == NS_FILE ) {
 143                         $image = wfFindFile( $title );
 144                         if( $image ) {
 145                                 return $title;
 146                         }
 147                 }
 148
 149                 # MediaWiki namespace? Page may be "implied" if not customized.
 150                 # Just return it, with caps forced as the message system likes it.
 151                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 152                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 153                 }
 154
 155                 # Quoted term? Try without the quotes...
 156                 $matches = array();
 157                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 158                         return SearchEngine::getNearMatch( $matches[1] );
 159                 }
 160
 161                 return NULL;
 162         }
 163
 164         public static function legalSearchChars() {
 165                 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
 166         }
 167
 168         /**
 169          * Set the maximum number of results to return
 170          * and how many to skip before returning the first.
 171          *
 172          * @param $limit Integer
 173          * @param $offset Integer
 174          */
 175         function setLimitOffset( $limit, $offset = 0 ) {
 176                 $this->limit = intval( $limit );
 177                 $this->offset = intval( $offset );
 178         }
 179
 180         /**
 181          * Set which namespaces the search should include.
 182          * Give an array of namespace index numbers.
 183          *
 184          * @param $namespaces Array
 185          */
 186         function setNamespaces( $namespaces ) {
 187                 $this->namespaces = $namespaces;
 188         }
 189
 190         /**
 191          * Parse some common prefixes: all (search everything)
 192          * or namespace names
 193          *
 194          * @param $query String
 195          */
 196         function replacePrefixes( $query ){
 197                 global $wgContLang;
 198
 199                 if( strpos($query,':') === false )
 200                         return $query; // nothing to do
 201
 202                 $parsed = $query;
 203                 $allkeyword = wfMsgForContent('searchall').":";
 204                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 205                         $this->namespaces = null;
 206                         $parsed = substr($query,strlen($allkeyword));
 207                 } else if( strpos($query,':') !== false ) {
 208                         $prefix = substr($query,0,strpos($query,':'));
 209                         $index = $wgContLang->getNsIndex($prefix);
 210                         if($index !== false){
 211                                 $this->namespaces = array($index);
 212                                 $parsed = substr($query,strlen($prefix)+1);
 213                         }
 214                 }
 215                 if(trim($parsed) == '')
 216                         return $query; // prefix was the whole query
 217
 218                 return $parsed;
 219         }
 220
 221         /**
 222          * Make a list of searchable namespaces and their canonical names.
 223          * @return Array
 224          */
 225         public static function searchableNamespaces() {
 226                 global $wgContLang;
 227                 $arr = array();
 228                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 229                         if( $ns >= NS_MAIN ) {
 230                                 $arr[$ns] = $name;
 231                         }
 232                 }
 233                 return $arr;
 234         }
 235
 236         /**
 237          * Extract default namespaces to search from the given user's
 238          * settings, returning a list of index numbers.
 239          *
 240          * @param $user User
 241          * @return Array
 242          */
 243         public static function userNamespaces( $user ) {
 244                 global $wgSearchEverythingOnlyLoggedIn;
 245
 246                 // get search everything preference, that can be set to be read for logged-in users
 247                 $searcheverything = false;
 248                 if( ( $wgSearchEverythingOnlyLoggedIn && $user->isLoggedIn() )
 249                     || !$wgSearchEverythingOnlyLoggedIn )
 250                         $searcheverything = $user->getOption('searcheverything');
 251
 252                 // searcheverything overrides other options
 253                 if( $searcheverything )
 254                         return array_keys(SearchEngine::searchableNamespaces());
 255
 256                 $arr = Preferences::loadOldSearchNs( $user );
 257                 $searchableNamespaces = SearchEngine::searchableNamespaces();
 258
 259                 $arr = array_intersect( $arr, array_keys($searchableNamespaces) ); // Filter
 260
 261                 return $arr;
 262         }
 263
 264         /**
 265          * Find snippet highlight settings for a given user
 266          *
 267          * @param $user User
 268          * @return Array contextlines, contextchars
 269          */
 270         public static function userHighlightPrefs( &$user ){
 271                 //$contextlines = $user->getOption( 'contextlines',  5 );
 272                 //$contextchars = $user->getOption( 'contextchars', 50 );
 273                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 274                 $contextchars = 75; // same as above.... :P
 275                 return array($contextlines, $contextchars);
 276         }
 277
 278         /**
 279          * An array of namespaces indexes to be searched by default
 280          *
 281          * @return Array
 282          */
 283         public static function defaultNamespaces(){
 284                 global $wgNamespacesToBeSearchedDefault;
 285
 286                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 287         }
 288
 289         /**
 290          * Get a list of namespace names useful for showing in tooltips
 291          * and preferences
 292          *
 293          * @param $namespaces Array
 294          */
 295         public static function namespacesAsText( $namespaces ){
 296                 global $wgContLang;
 297
 298                 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
 299                 foreach( $formatted as $key => $ns ){
 300                         if ( empty($ns) )
 301                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 302                 }
 303                 return $formatted;
 304         }
 305
 306         /**
 307          * Return the help namespaces to be shown on Special:Search
 308          *
 309          * @return Array
 310          */
 311         public static function helpNamespaces() {
 312                 global $wgNamespacesToBeSearchedHelp;
 313
 314                 return array_keys( $wgNamespacesToBeSearchedHelp, true );
 315         }
 316
 317         /**
 318          * Return a 'cleaned up' search string
 319          *
 320          * @param $text String
 321          * @return String
 322          */
 323         function filter( $text ) {
 324                 $lc = $this->legalSearchChars();
 325                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 326         }
 327         /**
 328          * Load up the appropriate search engine class for the currently
 329          * active database backend, and return a configured instance.
 330          *
 331          * @return SearchEngine
 332          */
 333         public static function create() {
 334                 global $wgSearchType;
 335                 $dbr = wfGetDB( DB_SLAVE );
 336                 if( $wgSearchType ) {
 337                         $class = $wgSearchType;
 338                 } else {
 339                         $class = $dbr->getSearchEngine();
 340                 }
 341                 $search = new $class( $dbr );
 342                 $search->setLimitOffset(0,0);
 343                 return $search;
 344         }
 345
 346         /**
 347          * Create or update the search index record for the given page.
 348          * Title and text should be pre-processed.
 349          * STUB
 350          *
 351          * @param $id Integer
 352          * @param $title String
 353          * @param $text String
 354          */
 355         function update( $id, $title, $text ) {
 356                 // no-op
 357         }
 358
 359         /**
 360          * Update a search index record's title only.
 361          * Title should be pre-processed.
 362          * STUB
 363          *
 364          * @param $id Integer
 365          * @param $title String
 366          */
 367         function updateTitle( $id, $title ) {
 368                 // no-op
 369         }
 370
 371         /**
 372          * Get OpenSearch suggestion template
 373          *
 374          * @return String
 375          */
 376         public static function getOpenSearchTemplate() {
 377                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 378                 if( $wgOpenSearchTemplate )     {
 379                         return $wgOpenSearchTemplate;
 380                 } else {
 381                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
 382                         if( !$ns ) $ns = "0";
 383                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 384                 }
 385         }
 386
 387         /**
 388          * Get internal MediaWiki Suggest template
 389          *
 390          * @return String
 391          */
 392         public static function getMWSuggestTemplate() {
 393                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 394                 if($wgMWSuggestTemplate)
 395                         return $wgMWSuggestTemplate;
 396                 else
 397                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
 398         }
 399 }
 400
 401 /**
 402  * @ingroup Search
 403  */
 404 class SearchResultSet {
 405         /**
 406          * Fetch an array of regular expression fragments for matching
 407          * the search terms as parsed by this engine in a text extract.
 408          * STUB
 409          *
 410          * @return Array
 411          */
 412         function termMatches() {
 413                 return array();
 414         }
 415
 416         function numRows() {
 417                 return 0;
 418         }
 419
 420         /**
 421          * Return true if results are included in this result set.
 422          * STUB
 423          *
 424          * @return Boolean
 425          */
 426         function hasResults() {
 427                 return false;
 428         }
 429
 430         /**
 431          * Some search modes return a total hit count for the query
 432          * in the entire article database. This may include pages
 433          * in namespaces that would not be matched on the given
 434          * settings.
 435          *
 436          * Return null if no total hits number is supported.
 437          *
 438          * @return Integer
 439          */
 440         function getTotalHits() {
 441                 return null;
 442         }
 443
 444         /**
 445          * Some search modes return a suggested alternate term if there are
 446          * no exact hits. Returns true if there is one on this set.
 447          *
 448          * @return Boolean
 449          */
 450         function hasSuggestion() {
 451                 return false;
 452         }
 453
 454         /**
 455          * @return String: suggested query, null if none
 456          */
 457         function getSuggestionQuery(){
 458                 return null;
 459         }
 460
 461         /**
 462          * @return String: HTML highlighted suggested query, '' if none
 463          */
 464         function getSuggestionSnippet(){
 465                 return '';
 466         }
 467
 468         /**
 469          * Return information about how and from where the results were fetched,
 470          * should be useful for diagnostics and debugging
 471          *
 472          * @return String
 473          */
 474         function getInfo() {
 475                 return null;
 476         }
 477
 478         /**
 479          * Return a result set of hits on other (multiple) wikis associated with this one
 480          *
 481          * @return SearchResultSet
 482          */
 483         function getInterwikiResults() {
 484                 return null;
 485         }
 486
 487         /**
 488          * Check if there are results on other wikis
 489          *
 490          * @return Boolean
 491          */
 492         function hasInterwikiResults() {
 493                 return $this->getInterwikiResults() != null;
 494         }
 495
 496
 497         /**
 498          * Fetches next search result, or false.
 499          * STUB
 500          *
 501          * @return SearchResult
 502          */
 503         function next() {
 504                 return false;
 505         }
 506
 507         /**
 508          * Frees the result set, if applicable.
 509          */
 510         function free() {
 511                 // ...
 512         }
 513 }
 514
 515
 516 /**
 517  * @ingroup Search
 518  */
 519 class SearchResultTooMany {
 520         ## Some search engines may bail out if too many matches are found
 521 }
 522
 523
 524 /**
 525  * @todo Fixme: This class is horribly factored. It would probably be better to
 526  * have a useful base class to which you pass some standard information, then
 527  * let the fancy self-highlighters extend that.
 528  * @ingroup Search
 529  */
 530 class SearchResult {
 531         var $mRevision = null;
 532         var $mImage = null;
 533
 534         function __construct( $row ) {
 535                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 536                 if( !is_null($this->mTitle) ){
 537                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 538                         if( $this->mTitle->getNamespace() === NS_FILE )
 539                                 $this->mImage = wfFindFile( $this->mTitle );
 540                 }
 541         }
 542
 543         /**
 544          * Check if this is result points to an invalid title
 545          *
 546          * @return Boolean
 547          */
 548         function isBrokenTitle(){
 549                 if( is_null($this->mTitle) )
 550                         return true;
 551                 return false;
 552         }
 553
 554         /**
 555          * Check if target page is missing, happens when index is out of date
 556          *
 557          * @return Boolean
 558          */
 559         function isMissingRevision(){
 560                 return !$this->mRevision && !$this->mImage;
 561         }
 562
 563         /**
 564          * @return Title
 565          */
 566         function getTitle() {
 567                 return $this->mTitle;
 568         }
 569
 570         /**
 571          * @return Double or null if not supported
 572          */
 573         function getScore() {
 574                 return null;
 575         }
 576
 577         /**
 578          * Lazy initialization of article text from DB
 579          */
 580         protected function initText(){
 581                 if( !isset($this->mText) ){
 582                         if($this->mRevision != null)
 583                                 $this->mText = $this->mRevision->getText();
 584                         else // TODO: can we fetch raw wikitext for commons images?
 585                                 $this->mText = '';
 586
 587                 }
 588         }
 589
 590         /**
 591          * @param $terms Array: terms to highlight
 592          * @return String: highlighted text snippet, null (and not '') if not supported
 593          */
 594         function getTextSnippet($terms){
 595                 global $wgUser, $wgAdvancedSearchHighlighting;
 596                 $this->initText();
 597                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 598                 $h = new SearchHighlighter();
 599                 if( $wgAdvancedSearchHighlighting )
 600                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 601                 else
 602                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 603         }
 604
 605         /**
 606          * @param $terms Array: terms to highlight
 607          * @return String: highlighted title, '' if not supported
 608          */
 609         function getTitleSnippet($terms){
 610                 return '';
 611         }
 612
 613         /**
 614          * @param $terms Array: terms to highlight
 615          * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
 616          */
 617         function getRedirectSnippet($terms){
 618                 return '';
 619         }
 620
 621         /**
 622          * @return Title object for the redirect to this page, null if none or not supported
 623          */
 624         function getRedirectTitle(){
 625                 return null;
 626         }
 627
 628         /**
 629          * @return string highlighted relevant section name, null if none or not supported
 630          */
 631         function getSectionSnippet(){
 632                 return '';
 633         }
 634
 635         /**
 636          * @return Title object (pagename+fragment) for the section, null if none or not supported
 637          */
 638         function getSectionTitle(){
 639                 return null;
 640         }
 641
 642         /**
 643          * @return String: timestamp
 644          */
 645         function getTimestamp(){
 646                 if( $this->mRevision )
 647                         return $this->mRevision->getTimestamp();
 648                 else if( $this->mImage )
 649                         return $this->mImage->getTimestamp();
 650                 return '';
 651         }
 652
 653         /**
 654          * @return Integer: number of words
 655          */
 656         function getWordCount(){
 657                 $this->initText();
 658                 return str_word_count( $this->mText );
 659         }
 660
 661         /**
 662          * @return Integer: size in bytes
 663          */
 664         function getByteSize(){
 665                 $this->initText();
 666                 return strlen( $this->mText );
 667         }
 668
 669         /**
 670          * @return Boolean if hit has related articles
 671          */
 672         function hasRelated(){
 673                 return false;
 674         }
 675
 676         /**
 677          * @return String: interwiki prefix of the title (return iw even if title is broken)
 678          */
 679         function getInterwikiPrefix(){
 680                 return '';
 681         }
 682 }
 683
 684 /**
 685  * Highlight bits of wikitext
 686  *
 687  * @ingroup Search
 688  */
 689 class SearchHighlighter {
 690         var $mCleanWikitext = true;
 691
	/**
	 * Constructor.
	 *
	 * Uses the __construct name because a method named after the class is
	 * no longer treated as a constructor as of PHP 8.0; with the old name
	 * the argument would be silently ignored there.
	 *
	 * @param $cleanupWikitext Boolean: value stored in $mCleanWikitext
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Old PHP 4-style constructor name, kept as a plain method so that
	 * existing explicit calls (e.g. from subclasses) keep working.
	 *
	 * @deprecated use __construct()
	 * @param $cleanupWikitext Boolean: value stored in $mCleanWikitext
	 */
	function SearchHighlighter( $cleanupWikitext = true ) {
		$this->__construct( $cleanupWikitext );
	}
 695
 696         /**
 697          * Default implementation of wikitext highlighting
 698          *
 699          * @param $text String
 700          * @param $terms Array: terms to highlight (unescaped)
 701          * @param $contextlines Integer
 702          * @param $contextchars Integer
 703          * @return String
 704          */
 705         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 706                 global $wgLang, $wgContLang;
 707                 global $wgSearchHighlightBoundaries;
 708                 $fname = __METHOD__;
 709
 710                 if($text == '')
 711                         return '';
 712
  713                 // split text into text + templates/links/tables
 714                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 715                 // first capture group is for detecting nested templates/links/tables/references
 716                 $endPatterns = array(
 717                         1 => '/(\{\{)|(\}\})/', // template
 718                         2 => '/(\[\[)|(\]\])/', // image
 719                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 720
 721                 // FIXME: this should prolly be a hook or something
 722                 if(function_exists('wfCite')){
 723                         $spat .= '|(<ref>)'; // references via cite extension
 724                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 725                 }
 726                 $spat .= '/';
 727                 $textExt = array(); // text extracts
 728                 $otherExt = array();  // other extracts
 729                 wfProfileIn( "$fname-split" );
 730                 $start = 0;
 731                 $textLen = strlen($text);
 732                 $count = 0; // sequence number to maintain ordering
 733                 while( $start < $textLen ){
 734                         // find start of template/image/table
 735                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 736                                 $epat = '';
 737                                 foreach($matches as $key => $val){
 738                                         if($key > 0 && $val[1] != -1){
 739                                                 if($key == 2){
 740                                                         // see if this is an image link
 741                                                         $ns = substr($val[0],2,-1);
 742                                                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
 743                                                                 break;
 744
 745                                                 }
 746                                                 $epat = $endPatterns[$key];
 747                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 748                                                 $start = $val[1];
 749                                                 break;
 750                                         }
 751                                 }
 752                                 if( $epat ){
 753                                         // find end (and detect any nested elements)
 754                                         $level = 0;
 755                                         $offset = $start + 1;
 756                                         $found = false;
 757                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 758                                                 if( array_key_exists(2,$endMatches) ){
 759                                                         // found end
 760                                                         if($level == 0){
 761                                                                 $len = strlen($endMatches[2][0]);
 762                                                                 $off = $endMatches[2][1];
 763                                                                 $this->splitAndAdd( $otherExt, $count,
 764                                                                         substr( $text, $start, $off + $len  - $start ) );
 765                                                                 $start = $off + $len;
 766                                                                 $found = true;
 767                                                                 break;
 768                                                         } else{
 769                                                                 // end of nested element
 770                                                                 $level -= 1;
 771                                                         }
 772                                                 } else{
 773                                                         // nested
 774                                                         $level += 1;
 775                                                 }
 776                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 777                                         }
 778                                         if( ! $found ){
 779                                                 // couldn't find appropriate closing tag, skip
 780                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 781                                                 $start += strlen($matches[0][0]);
 782                                         }
 783                                         continue;
 784                                 }
 785                         }
 786                         // else: add as text extract
 787                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 788                         break;
 789                 }
 790
 791                 $all = $textExt + $otherExt; // these have disjunct key sets
 792
 793                 wfProfileOut( "$fname-split" );
 794
 795                 // prepare regexps
 796                 foreach( $terms as $index => $term ) {
 797                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 798                         if(preg_match('/[\x80-\xff]/', $term) ){
 799                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 800                         } else {
 801                                 $terms[$index] = $term;
 802                         }
 803                 }
 804                 $anyterm = implode( '|', $terms );
 805                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 806
 807                 // FIXME: a hack to scale contextchars, a correct solution
 808                 // would be to have contextchars actually be char and not byte
 809                 // length, and do proper utf-8 substrings and lengths everywhere,
 810                 // but PHP is making that very hard and unclean to implement :(
 811                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 812                 $contextchars = intval( $contextchars * $scale );
 813
 814                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 815                 $patPost = "($wgSearchHighlightBoundaries|$)";
 816
 817                 $pat1 = "/(".$phrase.")/ui";
 818                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 819
 820                 wfProfileIn( "$fname-extract" );
 821
 822                 $left = $contextlines;
 823
 824                 $snippets = array();
 825                 $offsets = array();
 826
 827                 // show beginning only if it contains all words
 828                 $first = 0;
 829                 $firstText = '';
 830                 foreach($textExt as $index => $line){
 831                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 832                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 833                                 $first = $index;
 834                                 break;
 835                         }
 836                 }
 837                 if( $firstText ){
 838                         $succ = true;
 839                         // check if first text contains all terms
 840                         foreach($terms as $term){
 841                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 842                                         $succ = false;
 843                                         break;
 844                                 }
 845                         }
 846                         if( $succ ){
 847                                 $snippets[$first] = $firstText;
 848                                 $offsets[$first] = 0;
 849                         }
 850                 }
 851                 if( ! $snippets ) {
 852                         // match whole query on text
 853                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 854                         // match whole query on templates/tables/images
 855                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 856                         // match any words on text
 857                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 858                         // match any words on templates/tables/images
 859                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 860
 861                         ksort($snippets);
 862                 }
 863
 864                 // add extra chars to each snippet to make snippets constant size
 865                 $extended = array();
 866                 if( count( $snippets ) == 0){
 867                         // couldn't find the target words, just show beginning of article
 868                         $targetchars = $contextchars * $contextlines;
 869                         $snippets[$first] = '';
 870                         $offsets[$first] = 0;
 871                 } else{
 872                         // if begin of the article contains the whole phrase, show only that !!
 873                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 874                             && $offsets[$first] < $contextchars * 2 ){
 875                                 $snippets = array ($first => $snippets[$first]);
 876                         }
 877
 878                         // calc by how much to extend existing snippets
 879                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 880                 }
 881
 882                 foreach($snippets as $index => $line){
 883                         $extended[$index] = $line;
 884                         $len = strlen($line);
 885                         if( $len < $targetchars - 20 ){
 886                                 // complete this line
 887                                 if($len < strlen( $all[$index] )){
 888                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 889                                         $len = strlen( $extended[$index] );
 890                                 }
 891
 892                                 // add more lines
 893                                 $add = $index + 1;
 894                                 while( $len < $targetchars - 20
 895                                        && array_key_exists($add,$all)
 896                                        && !array_key_exists($add,$snippets) ){
 897                                     $offsets[$add] = 0;
 898                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 899                                         $extended[$add] = $tt;
 900                                         $len += strlen( $tt );
 901                                         $add++;
 902                                 }
 903                         }
 904                 }
 905
 906                 //$snippets = array_map('htmlspecialchars', $extended);
 907                 $snippets = $extended;
 908                 $last = -1;
 909                 $extract = '';
 910                 foreach($snippets as $index => $line){
 911                         if($last == -1)
 912                                 $extract .= $line; // first line
 913                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 914                                 $extract .= " ".$line; // continous lines
 915                         else
 916                                 $extract .= '<b> ... </b>' . $line;
 917
 918                         $last = $index;
 919                 }
 920                 if( $extract )
 921                         $extract .= '<b> ... </b>';
 922
 923                 $processed = array();
 924                 foreach($terms as $term){
 925                         if( ! isset($processed[$term]) ){
 926                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 927                                 $extract = preg_replace( $pat3,
 928                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 929                                 $processed[$term] = true;
 930                         }
 931                 }
 932
 933                 wfProfileOut( "$fname-extract" );
 934
 935                 return $extract;
 936         }
 937
 /**
  * Split text into lines and append every non-empty line to the
  * extracts array.
  *
  * If $this->mCleanWikitext is set, the text is passed through
  * removeWiki() before it is split. Each line is trimmed, and lines
  * that are empty after trimming are skipped.
  *
  * @param $extracts Array: (in/out) index -> line; new lines are stored at $count
  * @param $count Integer: (in/out) next index to use, incremented for each stored line
  * @param $text String: text to split on "\n"
  */
 function splitAndAdd(&$extracts, &$count, $text){
     $source = $this->mCleanWikitext ? $this->removeWiki($text) : $text;
     foreach( explode( "\n", $source ) as $line ){
         $tt = trim($line);
         // Compare against '' explicitly: a plain truthiness test would
         // also discard a line whose whole content is "0".
         if( $tt !== '' )
             $extracts[$count++] = $tt;
     }
 }
 953
 /**
  * Manual case folding for a single matched character, for use as a
  * preg_replace_callback() callback.
  *
  * A match longer than one byte is turned into a regex character class
  * holding its lower-case and upper-case forms as produced by
  * $wgContLang; a one-byte match is returned as it is.
  *
  * @param $matches Array: match data, only element 0 is used
  * @return String
  */
 function caseCallback($matches){
     global $wgContLang;
     $char = $matches[0];
     if( strlen($char) <= 1 )
         return $char;
     return '['.$wgContLang->lc($char).$wgContLang->uc($char).']';
 }
 966
 /**
  * Extract the part of the text between start and end without chopping
  * up words.
  *
  * A non-zero $start is moved to one byte past the position that
  * position() reports for it. An $end at or beyond the end of the text
  * is clamped to the text length, otherwise it is moved to the position
  * that position() reports for it.
  *
  * @param $text String
  * @param $start Integer: requested start (byte offset)
  * @param $end Integer: requested end (byte offset)
  * @param $posStart Integer: (out) actual start position
  * @param $posEnd Integer: (out) actual end position
  * @return String: the extracted text, or '' if the adjusted end is not
  *  after the adjusted start
  */
 function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
     $textLen = strlen($text);

     if( $start != 0 )
         $start = $this->position( $text, $start, 1 );
     $end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

     // Always report the adjusted positions. Guarding these with
     // !is_null() left the out-parameters untouched whenever the
     // caller's variable happened to be null/unset.
     $posStart = $start;
     $posEnd = $end;

     return ( $end > $start ) ? substr($text, $start, $end-$start) : '';
 }
 997
 /**
  * Find a non-letter near a point (byte index) in the text.
  *
  * The window [$point - 10, $point + 10) is scanned and the FIRST
  * separator character in that window is used (not necessarily the one
  * closest to $point). If the window holds no separator, $point itself
  * is returned, advanced past any UTF-8 continuation bytes so that it
  * never lands inside a multi-byte character.
  *
  * @param $text String
  * @param $point Integer: byte index to look around
  * @param $offset Integer: added to the index of a found separator
  *  (not applied when no separator is found)
  * @return Integer: separator index plus $offset, or the beginning of a
  *  UTF-8 char if none; without a separator the result is always within
  *  0 .. strlen($text)
  */
 function position($text, $point, $offset=0 ){
     $tolerance = 10;
     $textLen = strlen($text);
     $s = max( 0, $point - $tolerance );
     $l = min( $textLen, $point + $tolerance ) - $s;
     $m = array();
     // $l <= 0 means the window lies outside the text; a negative length
     // must not reach substr(), which would count it from the end.
     if( $l > 0
         && preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
         return $m[0][1] + $s + $offset;
     }

     // No separator nearby: fall back to $point, but never index outside
     // the string (empty text, or a point at/after the end or below 0).
     if( $point >= $textLen )
         return $textLen;
     if( $point < 0 )
         $point = 0;

     // check if point is on a valid first UTF8 char
     $char = ord( $text[$point] );
     while( $char >= 0x80 && $char < 0xc0 ) {
         // skip trailing bytes
         $point++;
         if( $point >= $textLen )
             return $textLen;
         $char = ord( $text[$point] );
     }
     return $point;
 }
1027
/**
 * Search extracts for a pattern and record one snippet per matching line.
 *
 * Lines whose index is already present in $out are skipped. For each
 * other line that matches, a window of $contextchars bytes is chosen
 * (from the line start if the match ends before $contextchars, from the
 * match start if the match is longer than $contextchars, otherwise
 * centred on the match) and handed to extract(). Stops as soon as
 * $linesleft reaches zero.
 *
 * @param $pattern String: regexp for matching lines
 * @param $extracts Array: extracts to search (index -> line)
 * @param $linesleft Integer: (in/out) number of extracts still to make
 * @param $contextchars Integer: length of snippet
 * @param $out Array: (in/out) map index -> snippet
 * @param $offsets Array: (in/out) map index -> starting point of snippet
 * @protected
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
    if( $linesleft == 0 )
        return; // nothing to do

    foreach( $extracts as $index => $line ){
        if( array_key_exists($index,$out) )
            continue; // this line already has a snippet

        $hit = array();
        if( !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE ) )
            continue;

        $matchAt = $hit[0][1];
        $matchLen = strlen($hit[0][0]);

        if( $matchAt + $matchLen < $contextchars ){
            // match fits into a window starting at the line start
            $begin = 0;
        } elseif( $matchLen > $contextchars ){
            // match is longer than the window
            $begin = $matchAt;
        } else {
            // put the match in the middle of the window
            $begin = $matchAt + intval( ($matchLen - $contextchars) / 2 );
        }

        // basic snippet from this line; extract() reports the real start
        $posBegin = $begin;
        $out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
        $offsets[$index] = $posBegin;

        if( --$linesleft == 0 )
            return;
    }
}
1070
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regex rewrites, in order: templates,
 * unpiped links, piped links (through linkReplace()), tag-like
 * <...> sequences, and finally bold/italic quote markup.
 *
 * @param $text String
 * @return String
 * @protected
 */
function removeWiki($text) {
    $fname = __METHOD__;
    wfProfileIn( $fname );

    // pattern => replacement pairs applied before piped links are handled
    $beforeLinks = array(
        // {{...}} without a pipe: drop
        array( "/\\{\\{([^|]+?)\\}\\}/", "" ),
        // {{...|...}}: keep what follows the first pipe
        array( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ),
        // [[...]] without a pipe: keep the inner text
        array( "/\\[\\[([^|]+?)\\]\\]/", "\\1" ),
    );
    foreach( $beforeLinks as $rule ){
        $text = preg_replace( $rule[0], $rule[1], $text );
    }

    // [[target|caption]]: decided per link by linkReplace()
    $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);

    // pattern => replacement pairs applied after piped links are handled
    $afterLinks = array(
        // anything that looks like a tag
        array( "/<\/?[^>]+>/", "" ),
        // quote markup, longest run first
        array( "/'''''/", "" ),
        array( "/('''|<\/?[iIuUbB]>)/", "" ),
        array( "/''/", "" ),
    );
    foreach( $afterLinks as $rule ){
        $text = preg_replace( $rule[0], $rule[1], $text );
    }

    wfProfileOut( $fname );
    return $text;
}
1098
/**
 * Callback to replace [[target|caption]] kind of links.
 *
 * The link is replaced by its caption, unless the text before the
 * first colon of the target resolves (via $wgContLang->getNsIndex())
 * to NS_FILE or NS_CATEGORY, in which case the whole link is kept.
 *
 * @param $matches Array: 0 = whole link, 1 = target part, 2 = caption
 * @return String
 */
function linkReplace($matches){
    $target = $matches[1];
    $colon = strpos( $target, ':' );

    if( $colon !== false ){
        global $wgContLang;
        $index = $wgContLang->getNsIndex( substr( $target, 0, $colon ) );
        if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
            return $matches[0]; // return the whole thing
    }

    return $matches[2]; // replace with caption
}
1118
/**
 * Simple & fast snippet extraction, but gives completely irrelevant
 * snippets.
 *
 * Walks the text line by line; for each line that matches any term, up
 * to $contextlines lines in total, it emits the matched term with
 * context before and after it (cut with $wgContLang->truncate()),
 * HTML-escaped, with every term occurrence wrapped in
 * <span class='searchmatch'>. Each emitted line ends with "\n".
 *
 * The terms are joined with '|' and placed into the regexes unescaped.
 * NOTE(review): presumably they already are regex fragments supplied by
 * the search backend - confirm against the callers.
 *
 * @param $text String
 * @param $terms Array
 * @param $contextlines Integer: maximum number of matching lines to emit
 * @param $contextchars Integer: amount of context kept on each side
 * @return String: HTML
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
    global $wgContLang;
    $fname = __METHOD__;

    $lines = explode( "\n", $text );

    $terms = implode( '|', $terms );
    $max = intval( $contextchars ) + 1;
    $pat1 = "/(.*)($terms)(.{0,$max})/i";
    $pat2 = '/(' . $terms . ")/i";

    $extract = "";
    wfProfileIn( "$fname-extract" );
    foreach ( $lines as $line ) {
        if ( 0 == $contextlines ) {
            break;
        }
        $m = array();
        if ( ! preg_match( $pat1, $line, $m ) ) {
            continue;
        }
        --$contextlines;
        $pre = $wgContLang->truncate( $m[1], -$contextchars );
        $post = isset( $m[3] ) ? $wgContLang->truncate( $m[3], $contextchars ) : '';
        $found = $m[2];

        // Highlight on the raw text and escape each piece separately.
        // Matching after htmlspecialchars() let a term such as "amp"
        // match inside an entity like &amp; and break it.
        $snippet = $pre . $found . $post;
        $html = '';
        $cursor = 0;
        $hits = array();
        if ( preg_match_all( $pat2, $snippet, $hits, PREG_OFFSET_CAPTURE ) ) {
            foreach ( $hits[0] as $hit ) {
                list( $matched, $at ) = $hit;
                if ( $matched === '' ) {
                    continue; // nothing to highlight
                }
                $html .= htmlspecialchars( substr( $snippet, $cursor, $at - $cursor ) )
                    . "<span class='searchmatch'>"
                    . htmlspecialchars( $matched )
                    . "</span>";
                $cursor = $at + strlen( $matched );
            }
        }
        $html .= htmlspecialchars( (string)substr( $snippet, $cursor ) );

        $extract .= $html . "\n";
    }
    wfProfileOut( "$fname-extract" );

    return $extract;
}
1174
1175 }
1176
/**
 * Placeholder search backend, to be used when a non-supported database
 * engine is present. It adds nothing to SearchEngine and overrides
 * nothing.
 *
 * @todo Fixme: this class should probably try something at least mildly
 * useful, such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
    // intentionally empty: everything is inherited from SearchEngine
}