includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Contain a class for special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $prefix = '';
  17         var $searchTerms = array();
  18         var $namespaces = array( NS_MAIN );
  19         var $showRedirects = false;
  20
  21         /**
  22          * Perform a full text search query and return a result set.
  23          * If title searches are not supported or disabled, return null.
  24          *
  25          * @param string $term - Raw search term
  26          * @return SearchResultSet
  27          * @access public
  28          * @abstract
  29          */
  30         function searchText( $term ) {
  31                 return null;
  32         }
  33
  34         /**
  35          * Perform a title-only search query and return a result set.
  36          * If title searches are not supported or disabled, return null.
  37          *
  38          * @param string $term - Raw search term
  39          * @return SearchResultSet
  40          * @access public
  41          * @abstract
  42          */
  43         function searchTitle( $term ) {
  44                 return null;
  45         }
  46
  47         /** If this search backend can list/unlist redirects */
  48         function acceptListRedirects() {
  49                 return true;
  50         }
  51
  52         /**
  53          * Transform search term in cases when parts of the query came as different GET params (when supported)
  54          * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
  55          */
  56         function transformSearchTerm( $term ) {
  57                 return $term;
  58         }
  59
  60         /**
  61          * If an exact title match can be find, or a very slightly close match,
  62          * return the title. If no match, returns NULL.
  63          *
  64          * @param string $term
  65          * @return Title
  66          */
  67         public static function getNearMatch( $searchterm ) {
  68                 global $wgContLang;
  69
  70                 $allSearchTerms = array($searchterm);
  71
  72                 if($wgContLang->hasVariants()){
  73                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  74                 }
  75
  76                 foreach($allSearchTerms as $term){
  77
  78                         # Exact match? No need to look further.
  79                         $title = Title::newFromText( $term );
  80                         if (is_null($title))
  81                                 return NULL;
  82
  83                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal()
  84                              || $title->exists() ) {
  85                                 return $title;
  86                         }
  87
  88                         # Now try all lower case (i.e. first letter capitalized)
  89                         #
  90                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  91                         if ( $title && $title->exists() ) {
  92                                 return $title;
  93                         }
  94
  95                         # Now try capitalized string
  96                         #
  97                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
  98                         if ( $title && $title->exists() ) {
  99                                 return $title;
 100                         }
 101
 102                         # Now try all upper case
 103                         #
 104                         $title = Title::newFromText( $wgContLang->uc( $term ) );
 105                         if ( $title && $title->exists() ) {
 106                                 return $title;
 107                         }
 108
 109                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 110                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
 111                         if ( $title && $title->exists() ) {
 112                                 return $title;
 113                         }
 114
 115                         // Give hooks a chance at better match variants
 116                         $title = null;
 117                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 118                                 return $title;
 119                         }
 120                 }
 121
 122                 $title = Title::newFromText( $searchterm );
 123
 124                 # Entering an IP address goes to the contributions page
 125                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 126                         || User::isIP( trim( $searchterm ) ) ) {
 127                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 128                 }
 129
 130
 131                 # Entering a user goes to the user page whether it's there or not
 132                 if ( $title->getNamespace() == NS_USER ) {
 133                         return $title;
 134                 }
 135
 136                 # Go to images that exist even if there's no local page.
 137                 # There may have been a funny upload, or it may be on a shared
 138                 # file repository such as Wikimedia Commons.
 139                 if( $title->getNamespace() == NS_FILE ) {
 140                         $image = wfFindFile( $title );
 141                         if( $image ) {
 142                                 return $title;
 143                         }
 144                 }
 145
 146                 # MediaWiki namespace? Page may be "implied" if not customized.
 147                 # Just return it, with caps forced as the message system likes it.
 148                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 149                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 150                 }
 151
 152                 # Quoted term? Try without the quotes...
 153                 $matches = array();
 154                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 155                         return SearchEngine::getNearMatch( $matches[1] );
 156                 }
 157
 158                 return NULL;
 159         }
 160
 161         public static function legalSearchChars() {
 162                 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
 163         }
 164
 165         /**
 166          * Set the maximum number of results to return
 167          * and how many to skip before returning the first.
 168          *
 169          * @param int $limit
 170          * @param int $offset
 171          * @access public
 172          */
 173         function setLimitOffset( $limit, $offset = 0 ) {
 174                 $this->limit = intval( $limit );
 175                 $this->offset = intval( $offset );
 176         }
 177
 178         /**
 179          * Set which namespaces the search should include.
 180          * Give an array of namespace index numbers.
 181          *
 182          * @param array $namespaces
 183          * @access public
 184          */
 185         function setNamespaces( $namespaces ) {
 186                 $this->namespaces = $namespaces;
 187         }
 188
 189         /**
 190          * Parse some common prefixes: all (search everything)
 191          * or namespace names
 192          *
 193          * @param string $query
 194          */
 195         function replacePrefixes( $query ){
 196                 global $wgContLang;
 197
 198                 if( strpos($query,':') === false )
 199                         return $query; // nothing to do
 200
 201                 $parsed = $query;
 202                 $allkeyword = wfMsgForContent('searchall').":";
 203                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 204                         $this->namespaces = null;
 205                         $parsed = substr($query,strlen($allkeyword));
 206                 } else if( strpos($query,':') !== false ) {
 207                         $prefix = substr($query,0,strpos($query,':'));
 208                         $index = $wgContLang->getNsIndex($prefix);
 209                         if($index !== false){
 210                                 $this->namespaces = array($index);
 211                                 $parsed = substr($query,strlen($prefix)+1);
 212                         }
 213                 }
 214                 if(trim($parsed) == '')
 215                         return $query; // prefix was the whole query
 216
 217                 return $parsed;
 218         }
 219
 220         /**
 221          * Make a list of searchable namespaces and their canonical names.
 222          * @return array
 223          */
 224         public static function searchableNamespaces() {
 225                 global $wgContLang;
 226                 $arr = array();
 227                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 228                         if( $ns >= NS_MAIN ) {
 229                                 $arr[$ns] = $name;
 230                         }
 231                 }
 232                 return $arr;
 233         }
 234
 235         /**
 236          * Extract default namespaces to search from the given user's
 237          * settings, returning a list of index numbers.
 238          *
 239          * @param User $user
 240          * @return array
 241          * @static
 242          */
 243         public static function userNamespaces( &$user ) {
 244                 $arr = array();
 245                 foreach( SearchEngine::searchableNamespaces() as $ns => $name ) {
 246                         if( $user->getOption( 'searchNs' . $ns ) ) {
 247                                 $arr[] = $ns;
 248                         }
 249                 }
 250                 return $arr;
 251         }
 252
 253         /**
 254          * Find snippet highlight settings for a given user
 255          *
 256          * @param User $user
 257          * @return array contextlines, contextchars
 258          * @static
 259          */
 260         public static function userHighlightPrefs( &$user ){
 261                 //$contextlines = $user->getOption( 'contextlines',  5 );
 262                 //$contextchars = $user->getOption( 'contextchars', 50 );
 263                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 264                 $contextchars = 75; // same as above.... :P
 265                 return array($contextlines, $contextchars);
 266         }
 267
 268         /**
 269          * An array of namespaces indexes to be searched by default
 270          *
 271          * @return array
 272          * @static
 273          */
 274         public static function defaultNamespaces(){
 275                 global $wgNamespacesToBeSearchedDefault;
 276
 277                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 278         }
 279
 280         /**
 281          * Get a list of namespace names useful for showing in tooltips
 282          * and preferences
 283          *
 284          * @param unknown_type $namespaces
 285          */
 286         public static function namespacesAsText( $namespaces ){
 287                 global $wgContLang;
 288
 289                 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
 290                 foreach( $formatted as $key => $ns ){
 291                         if ( empty($ns) )
 292                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 293                 }
 294                 return $formatted;
 295         }
 296
 297         /**
 298          * An array of "project" namespaces indexes typically searched
 299          * by logged-in users
 300          *
 301          * @return array
 302          * @static
 303          */
 304         public static function projectNamespaces() {
 305                 global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;
 306
 307                 return array_keys( $wgNamespacesToBeSearchedProject, true );
 308         }
 309
 310         /**
 311          * An array of "project" namespaces indexes typically searched
 312          * by logged-in users in addition to the default namespaces
 313          *
 314          * @return array
 315          * @static
 316          */
 317         public static function defaultAndProjectNamespaces() {
 318                 global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;
 319
 320                 return array_keys( $wgNamespacesToBeSearchedDefault +
 321                         $wgNamespacesToBeSearchedProject, true);
 322         }
 323
 324         /**
 325          * Return a 'cleaned up' search string
 326          *
 327          * @return string
 328          * @access public
 329          */
 330         function filter( $text ) {
 331                 $lc = $this->legalSearchChars();
 332                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 333         }
 334         /**
 335          * Load up the appropriate search engine class for the currently
 336          * active database backend, and return a configured instance.
 337          *
 338          * @return SearchEngine
 339          */
 340         public static function create() {
 341                 global $wgSearchType;
 342                 $dbr = wfGetDB( DB_SLAVE );
 343                 if( $wgSearchType ) {
 344                         $class = $wgSearchType;
 345                 } else {
 346                         $class = $dbr->getSearchEngine();
 347                 }
 348                 $search = new $class( $dbr );
 349                 $search->setLimitOffset(0,0);
 350                 return $search;
 351         }
 352
 353         /**
 354          * Create or update the search index record for the given page.
 355          * Title and text should be pre-processed.
 356          *
 357          * @param int $id
 358          * @param string $title
 359          * @param string $text
 360          * @abstract
 361          */
 362         function update( $id, $title, $text ) {
 363                 // no-op
 364         }
 365
 366         /**
 367          * Update a search index record's title only.
 368          * Title should be pre-processed.
 369          *
 370          * @param int $id
 371          * @param string $title
 372          * @abstract
 373          */
 374         function updateTitle( $id, $title ) {
 375                 // no-op
 376         }
 377
 378         /**
 379          * Get OpenSearch suggestion template
 380          *
 381          * @return string
 382          * @static
 383          */
 384         public static function getOpenSearchTemplate() {
 385                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 386                 if( $wgOpenSearchTemplate )     {
 387                         return $wgOpenSearchTemplate;
 388                 } else {
 389                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
 390                         if( !$ns ) $ns = "0";
 391                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 392                 }
 393         }
 394
 395         /**
 396          * Get internal MediaWiki Suggest template
 397          *
 398          * @return string
 399          * @static
 400          */
 401         public static function getMWSuggestTemplate() {
 402                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 403                 if($wgMWSuggestTemplate)
 404                         return $wgMWSuggestTemplate;
 405                 else
 406                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
 407         }
 408 }
 409
 410 /**
 411  * @ingroup Search
 412  */
 413 class SearchResultSet {
 414         /**
 415          * Fetch an array of regular expression fragments for matching
 416          * the search terms as parsed by this engine in a text extract.
 417          *
 418          * @return array
 419          * @access public
 420          * @abstract
 421          */
 422         function termMatches() {
 423                 return array();
 424         }
 425
 426         function numRows() {
 427                 return 0;
 428         }
 429
 430         /**
 431          * Return true if results are included in this result set.
 432          * @return bool
 433          * @abstract
 434          */
 435         function hasResults() {
 436                 return false;
 437         }
 438
 439         /**
 440          * Some search modes return a total hit count for the query
 441          * in the entire article database. This may include pages
 442          * in namespaces that would not be matched on the given
 443          * settings.
 444          *
 445          * Return null if no total hits number is supported.
 446          *
 447          * @return int
 448          * @access public
 449          */
 450         function getTotalHits() {
 451                 return null;
 452         }
 453
 454         /**
 455          * Some search modes return a suggested alternate term if there are
 456          * no exact hits. Returns true if there is one on this set.
 457          *
 458          * @return bool
 459          * @access public
 460          */
 461         function hasSuggestion() {
 462                 return false;
 463         }
 464
 465         /**
 466          * @return string suggested query, null if none
 467          */
 468         function getSuggestionQuery(){
 469                 return null;
 470         }
 471
 472         /**
 473          * @return string HTML highlighted suggested query, '' if none
 474          */
 475         function getSuggestionSnippet(){
 476                 return '';
 477         }
 478
 479         /**
 480          * Return information about how and from where the results were fetched,
 481          * should be useful for diagnostics and debugging
 482          *
 483          * @return string
 484          */
 485         function getInfo() {
 486                 return null;
 487         }
 488
 489         /**
 490          * Return a result set of hits on other (multiple) wikis associated with this one
 491          *
 492          * @return SearchResultSet
 493          */
 494         function getInterwikiResults() {
 495                 return null;
 496         }
 497
 498         /**
 499          * Check if there are results on other wikis
 500          *
 501          * @return boolean
 502          */
 503         function hasInterwikiResults() {
 504                 return $this->getInterwikiResults() != null;
 505         }
 506
 507
 508         /**
 509          * Fetches next search result, or false.
 510          * @return SearchResult
 511          * @access public
 512          * @abstract
 513          */
 514         function next() {
 515                 return false;
 516         }
 517
 518         /**
 519          * Frees the result set, if applicable.
 520          * @ access public
 521          */
 522         function free() {
 523                 // ...
 524         }
 525 }
 526
 527
 528 /**
 529  * @ingroup Search
 530  */
 531 class SearchResultTooMany {
 532         ## Some search engines may bail out if too many matches are found
 533 }
 534
 535
 536 /**
 537  * @fixme This class is horribly factored. It would probably be better to have
 538  * a useful base class to which you pass some standard information, then let
 539  * the fancy self-highlighters extend that.
 540  * @ingroup Search
 541  */
 542 class SearchResult {
 543         var $mRevision = null;
 544         var $mImage = null;
 545
 546         function __construct( $row ) {
 547                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 548                 if( !is_null($this->mTitle) ){
 549                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 550                         if( $this->mTitle->getNamespace() === NS_FILE )
 551                                 $this->mImage = wfFindFile( $this->mTitle );
 552                 }
 553         }
 554
 555         /**
 556          * Check if this is result points to an invalid title
 557          *
 558          * @return boolean
 559          * @access public
 560          */
 561         function isBrokenTitle(){
 562                 if( is_null($this->mTitle) )
 563                         return true;
 564                 return false;
 565         }
 566
 567         /**
 568          * Check if target page is missing, happens when index is out of date
 569          *
 570          * @return boolean
 571          * @access public
 572          */
 573         function isMissingRevision(){
 574                 return !$this->mRevision && !$this->mImage;
 575         }
 576
 577         /**
 578          * @return Title
 579          * @access public
 580          */
 581         function getTitle() {
 582                 return $this->mTitle;
 583         }
 584
 585         /**
 586          * @return double or null if not supported
 587          */
 588         function getScore() {
 589                 return null;
 590         }
 591
 592         /**
 593          * Lazy initialization of article text from DB
 594          */
 595         protected function initText(){
 596                 if( !isset($this->mText) ){
 597                         if($this->mRevision != null)
 598                                 $this->mText = $this->mRevision->getText();
 599                         else // TODO: can we fetch raw wikitext for commons images?
 600                                 $this->mText = '';
 601
 602                 }
 603         }
 604
 605         /**
 606          * @param array $terms terms to highlight
 607          * @return string highlighted text snippet, null (and not '') if not supported
 608          */
 609         function getTextSnippet($terms){
 610                 global $wgUser, $wgAdvancedSearchHighlighting;
 611                 $this->initText();
 612                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 613                 $h = new SearchHighlighter();
 614                 if( $wgAdvancedSearchHighlighting )
 615                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 616                 else
 617                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 618         }
 619
 620         /**
 621          * @param array $terms terms to highlight
 622          * @return string highlighted title, '' if not supported
 623          */
 624         function getTitleSnippet($terms){
 625                 return '';
 626         }
 627
 628         /**
 629          * @param array $terms terms to highlight
 630          * @return string highlighted redirect name (redirect to this page), '' if none or not supported
 631          */
 632         function getRedirectSnippet($terms){
 633                 return '';
 634         }
 635
 636         /**
 637          * @return Title object for the redirect to this page, null if none or not supported
 638          */
 639         function getRedirectTitle(){
 640                 return null;
 641         }
 642
 643         /**
 644          * @return string highlighted relevant section name, null if none or not supported
 645          */
 646         function getSectionSnippet(){
 647                 return '';
 648         }
 649
 650         /**
 651          * @return Title object (pagename+fragment) for the section, null if none or not supported
 652          */
 653         function getSectionTitle(){
 654                 return null;
 655         }
 656
 657         /**
 658          * @return string timestamp
 659          */
 660         function getTimestamp(){
 661                 if( $this->mRevision )
 662                         return $this->mRevision->getTimestamp();
 663                 else if( $this->mImage )
 664                         return $this->mImage->getTimestamp();
 665                 return '';
 666         }
 667
 668         /**
 669          * @return int number of words
 670          */
 671         function getWordCount(){
 672                 $this->initText();
 673                 return str_word_count( $this->mText );
 674         }
 675
 676         /**
 677          * @return int size in bytes
 678          */
 679         function getByteSize(){
 680                 $this->initText();
 681                 return strlen( $this->mText );
 682         }
 683
 684         /**
 685          * @return boolean if hit has related articles
 686          */
 687         function hasRelated(){
 688                 return false;
 689         }
 690
 691         /**
 692          * @return interwiki prefix of the title (return iw even if title is broken)
 693          */
 694         function getInterwikiPrefix(){
 695                 return '';
 696         }
 697 }
 698
 699 /**
 700  * Highlight bits of wikitext
 701  *
 702  * @ingroup Search
 703  */
 704 class SearchHighlighter {
 705         var $mCleanWikitext = true;
 706
	/**
	 * Declared as __construct rather than a method named after the class:
	 * class-named (PHP 4 style) constructors are not treated as
	 * constructors on PHP 8, so the argument would be silently ignored.
	 *
	 * @param bool $cleanupWikitext value stored in $mCleanWikitext
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}
 710
 711         /**
 712          * Default implementation of wikitext highlighting
 713          *
 714          * @param string $text
 715          * @param array $terms Terms to highlight (unescaped)
 716          * @param int $contextlines
 717          * @param int $contextchars
 718          * @return string
 719          */
 720         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 721                 global $wgLang, $wgContLang;
 722                 global $wgSearchHighlightBoundaries;
 723                 $fname = __METHOD__;
 724
 725                 if($text == '')
 726                         return '';
 727
 728                 // spli text into text + templates/links/tables
 729                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 730                 // first capture group is for detecting nested templates/links/tables/references
 731                 $endPatterns = array(
 732                         1 => '/(\{\{)|(\}\})/', // template
 733                         2 => '/(\[\[)|(\]\])/', // image
 734                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 735
 736                 // FIXME: this should prolly be a hook or something
 737                 if(function_exists('wfCite')){
 738                         $spat .= '|(<ref>)'; // references via cite extension
 739                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 740                 }
 741                 $spat .= '/';
 742                 $textExt = array(); // text extracts
 743                 $otherExt = array();  // other extracts
 744                 wfProfileIn( "$fname-split" );
 745                 $start = 0;
 746                 $textLen = strlen($text);
 747                 $count = 0; // sequence number to maintain ordering
 748                 while( $start < $textLen ){
 749                         // find start of template/image/table
 750                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 751                                 $epat = '';
 752                                 foreach($matches as $key => $val){
 753                                         if($key > 0 && $val[1] != -1){
 754                                                 if($key == 2){
 755                                                         // see if this is an image link
 756                                                         $ns = substr($val[0],2,-1);
 757                                                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
 758                                                                 break;
 759
 760                                                 }
 761                                                 $epat = $endPatterns[$key];
 762                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 763                                                 $start = $val[1];
 764                                                 break;
 765                                         }
 766                                 }
 767                                 if( $epat ){
 768                                         // find end (and detect any nested elements)
 769                                         $level = 0;
 770                                         $offset = $start + 1;
 771                                         $found = false;
 772                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 773                                                 if( array_key_exists(2,$endMatches) ){
 774                                                         // found end
 775                                                         if($level == 0){
 776                                                                 $len = strlen($endMatches[2][0]);
 777                                                                 $off = $endMatches[2][1];
 778                                                                 $this->splitAndAdd( $otherExt, $count,
 779                                                                         substr( $text, $start, $off + $len  - $start ) );
 780                                                                 $start = $off + $len;
 781                                                                 $found = true;
 782                                                                 break;
 783                                                         } else{
 784                                                                 // end of nested element
 785                                                                 $level -= 1;
 786                                                         }
 787                                                 } else{
 788                                                         // nested
 789                                                         $level += 1;
 790                                                 }
 791                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 792                                         }
 793                                         if( ! $found ){
 794                                                 // couldn't find appropriate closing tag, skip
 795                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 796                                                 $start += strlen($matches[0][0]);
 797                                         }
 798                                         continue;
 799                                 }
 800                         }
 801                         // else: add as text extract
 802                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 803                         break;
 804                 }
 805
 806                 $all = $textExt + $otherExt; // these have disjunct key sets
 807
 808                 wfProfileOut( "$fname-split" );
 809
 810                 // prepare regexps
 811                 foreach( $terms as $index => $term ) {
 812                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 813                         if(preg_match('/[\x80-\xff]/', $term) ){
 814                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 815                         } else {
 816                                 $terms[$index] = $term;
 817                         }
 818                 }
 819                 $anyterm = implode( '|', $terms );
 820                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 821
 822                 // FIXME: a hack to scale contextchars, a correct solution
 823                 // would be to have contextchars actually be char and not byte
 824                 // length, and do proper utf-8 substrings and lengths everywhere,
 825                 // but PHP is making that very hard and unclean to implement :(
 826                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 827                 $contextchars = intval( $contextchars * $scale );
 828
 829                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 830                 $patPost = "($wgSearchHighlightBoundaries|$)";
 831
 832                 $pat1 = "/(".$phrase.")/ui";
 833                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 834
 835                 wfProfileIn( "$fname-extract" );
 836
 837                 $left = $contextlines;
 838
 839                 $snippets = array();
 840                 $offsets = array();
 841
 842                 // show beginning only if it contains all words
 843                 $first = 0;
 844                 $firstText = '';
 845                 foreach($textExt as $index => $line){
 846                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 847                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 848                                 $first = $index;
 849                                 break;
 850                         }
 851                 }
 852                 if( $firstText ){
 853                         $succ = true;
 854                         // check if first text contains all terms
 855                         foreach($terms as $term){
 856                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 857                                         $succ = false;
 858                                         break;
 859                                 }
 860                         }
 861                         if( $succ ){
 862                                 $snippets[$first] = $firstText;
 863                                 $offsets[$first] = 0;
 864                         }
 865                 }
 866                 if( ! $snippets ) {
 867                         // match whole query on text
 868                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 869                         // match whole query on templates/tables/images
 870                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 871                         // match any words on text
 872                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 873                         // match any words on templates/tables/images
 874                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 875
 876                         ksort($snippets);
 877                 }
 878
 879                 // add extra chars to each snippet to make snippets constant size
 880                 $extended = array();
 881                 if( count( $snippets ) == 0){
 882                         // couldn't find the target words, just show beginning of article
 883                         $targetchars = $contextchars * $contextlines;
 884                         $snippets[$first] = '';
 885                         $offsets[$first] = 0;
 886                 } else{
 887                         // if begin of the article contains the whole phrase, show only that !!
 888                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 889                             && $offsets[$first] < $contextchars * 2 ){
 890                                 $snippets = array ($first => $snippets[$first]);
 891                         }
 892
 893                         // calc by how much to extend existing snippets
 894                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 895                 }
 896
 897                 foreach($snippets as $index => $line){
 898                         $extended[$index] = $line;
 899                         $len = strlen($line);
 900                         if( $len < $targetchars - 20 ){
 901                                 // complete this line
 902                                 if($len < strlen( $all[$index] )){
 903                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 904                                         $len = strlen( $extended[$index] );
 905                                 }
 906
 907                                 // add more lines
 908                                 $add = $index + 1;
 909                                 while( $len < $targetchars - 20
 910                                        && array_key_exists($add,$all)
 911                                        && !array_key_exists($add,$snippets) ){
 912                                     $offsets[$add] = 0;
 913                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 914                                         $extended[$add] = $tt;
 915                                         $len += strlen( $tt );
 916                                         $add++;
 917                                 }
 918                         }
 919                 }
 920
 921                 //$snippets = array_map('htmlspecialchars', $extended);
 922                 $snippets = $extended;
 923                 $last = -1;
 924                 $extract = '';
 925                 foreach($snippets as $index => $line){
 926                         if($last == -1)
 927                                 $extract .= $line; // first line
 928                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 929                                 $extract .= " ".$line; // continous lines
 930                         else
 931                                 $extract .= '<b> ... </b>' . $line;
 932
 933                         $last = $index;
 934                 }
 935                 if( $extract )
 936                         $extract .= '<b> ... </b>';
 937
 938                 $processed = array();
 939                 foreach($terms as $term){
 940                         if( ! isset($processed[$term]) ){
 941                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 942                                 $extract = preg_replace( $pat3,
 943                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 944                                 $processed[$term] = true;
 945                         }
 946                 }
 947
 948                 wfProfileOut( "$fname-extract" );
 949
 950                 return $extract;
 951         }
 952
 953         /**
 954          * Split text into lines and add it to extracts array
 955          *
 956          * @param array $extracts index -> $line
 957          * @param int $count
 958          * @param string $text
 959          */
 960         function splitAndAdd(&$extracts, &$count, $text){
 961                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 962                 foreach($split as $line){
 963                         $tt = trim($line);
 964                         if( $tt )
 965                                 $extracts[$count++] = $tt;
 966                 }
 967         }
 968
 969         /**
 970          * Do manual case conversion for non-ascii chars
 971          *
 972          * @param unknown_type $matches
 973          */
 974         function caseCallback($matches){
 975                 global $wgContLang;
 976                 if( strlen($matches[0]) > 1 ){
 977                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 978                 } else
 979                         return $matches[0];
 980         }
 981
 982         /**
 983          * Extract part of the text from start to end, but by
 984          * not chopping up words
 985          * @param string $text
 986          * @param int $start
 987          * @param int $end
 988          * @param int $posStart (out) actual start position
 989          * @param int $posEnd (out) actual end position
 990          * @return string
 991          */
 992         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 993                 global $wgContLang;
 994
 995                 if( $start != 0)
 996                         $start = $this->position( $text, $start, 1 );
 997                 if( $end >= strlen($text) )
 998                         $end = strlen($text);
 999                 else
1000                         $end = $this->position( $text, $end );
1001
1002                 if(!is_null($posStart))
1003                         $posStart = $start;
1004                 if(!is_null($posEnd))
1005                         $posEnd = $end;
1006
1007                 if($end > $start)
1008                         return substr($text, $start, $end-$start);
1009                 else
1010                         return '';
1011         }
1012
1013         /**
1014          * Find a nonletter near a point (index) in the text
1015          *
1016          * @param string $text
1017          * @param int $point
1018          * @param int $offset to found index
1019          * @return int nearest nonletter index, or beginning of utf8 char if none
1020          */
1021         function position($text, $point, $offset=0 ){
1022                 $tolerance = 10;
1023                 $s = max( 0, $point - $tolerance );
1024                 $l = min( strlen($text), $point + $tolerance ) - $s;
1025                 $m = array();
1026                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
1027                         return $m[0][1] + $s + $offset;
1028                 } else{
1029                         // check if point is on a valid first UTF8 char
1030                         $char = ord( $text[$point] );
1031                         while( $char >= 0x80 && $char < 0xc0 ) {
1032                                 // skip trailing bytes
1033                                 $point++;
1034                                 if($point >= strlen($text))
1035                                         return strlen($text);
1036                                 $char = ord( $text[$point] );
1037                         }
1038                         return $point;
1039
1040                 }
1041         }
1042
1043         /**
1044          * Search extracts for a pattern, and return snippets
1045          *
1046          * @param string $pattern regexp for matching lines
1047          * @param array $extracts extracts to search
1048          * @param int $linesleft number of extracts to make
1049          * @param int $contextchars length of snippet
1050          * @param array $out map for highlighted snippets
1051          * @param array $offsets map of starting points of snippets
1052          * @protected
1053          */
1054         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1055                 if($linesleft == 0)
1056                         return; // nothing to do
1057                 foreach($extracts as $index => $line){
1058                         if( array_key_exists($index,$out) )
1059                                 continue; // this line already highlighted
1060
1061                         $m = array();
1062                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1063                                 continue;
1064
1065                         $offset = $m[0][1];
1066                         $len = strlen($m[0][0]);
1067                         if($offset + $len < $contextchars)
1068                                 $begin = 0;
1069                         elseif( $len > $contextchars)
1070                                 $begin = $offset;
1071                         else
1072                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1073
1074                         $end = $begin + $contextchars;
1075
1076                         $posBegin = $begin;
1077                         // basic snippet from this line
1078                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1079                         $offsets[$index] = $posBegin;
1080                         $linesleft--;
1081                         if($linesleft == 0)
1082                                 return;
1083                 }
1084         }
1085
1086         /**
1087          * Basic wikitext removal
1088          * @protected
1089          */
1090         function removeWiki($text) {
1091                 $fname = __METHOD__;
1092                 wfProfileIn( $fname );
1093
1094                 //$text = preg_replace("/'{2,5}/", "", $text);
1095                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1096                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1097                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1098                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1099                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1100                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1101                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1102                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1103                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1104                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1105                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1106                 $text = preg_replace("/'''''/", "", $text);
1107                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1108                 $text = preg_replace("/''/", "", $text);
1109
1110                 wfProfileOut( $fname );
1111                 return $text;
1112         }
1113
1114         /**
1115          * callback to replace [[target|caption]] kind of links, if
1116          * the target is category or image, leave it
1117          *
1118          * @param array $matches
1119          */
1120         function linkReplace($matches){
1121                 $colon = strpos( $matches[1], ':' );
1122                 if( $colon === false )
1123                         return $matches[2]; // replace with caption
1124                 global $wgContLang;
1125                 $ns = substr( $matches[1], 0, $colon );
1126                 $index = $wgContLang->getNsIndex($ns);
1127                 if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
1128                         return $matches[0]; // return the whole thing
1129                 else
1130                         return $matches[2];
1131
1132         }
1133
1134         /**
1135      * Simple & fast snippet extraction, but gives completely unrelevant
1136      * snippets
1137      *
1138      * @param string $text
1139      * @param array $terms
1140      * @param int $contextlines
1141      * @param int $contextchars
1142      * @return string
1143      */
1144     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1145         global $wgLang, $wgContLang;
1146         $fname = __METHOD__;
1147
1148         $lines = explode( "\n", $text );
1149
1150         $terms = implode( '|', $terms );
1151         $max = intval( $contextchars ) + 1;
1152         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1153
1154         $lineno = 0;
1155
1156         $extract = "";
1157         wfProfileIn( "$fname-extract" );
1158         foreach ( $lines as $line ) {
1159             if ( 0 == $contextlines ) {
1160                 break;
1161             }
1162             ++$lineno;
1163             $m = array();
1164             if ( ! preg_match( $pat1, $line, $m ) ) {
1165                 continue;
1166             }
1167             --$contextlines;
1168             $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
1169
1170             if ( count( $m ) < 3 ) {
1171                 $post = '';
1172             } else {
1173                 $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
1174             }
1175
1176             $found = $m[2];
1177
1178             $line = htmlspecialchars( $pre . $found . $post );
1179             $pat2 = '/(' . $terms . ")/i";
1180             $line = preg_replace( $pat2,
1181               "<span class='searchmatch'>\\1</span>", $line );
1182
1183             $extract .= "${line}\n";
1184         }
1185         wfProfileOut( "$fname-extract" );
1186
1187         return $extract;
1188     }
1189
1190 }
1191
/**
 * Placeholder search backend, used when a non-supported database engine is
 * present.
 *
 * It overrides nothing, so every call falls through to SearchEngine.
 *
 * @fixme Dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
        // intentionally empty
}