includes/search/SearchHighlighter.php

   1 <?php
   2 /**
   3  * Basic search engine highlighting
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup Search
  22  */
  23
  24 /**
  25  * Highlight bits of wikitext
  26  *
  27  * @ingroup Search
  28  */
  29 class SearchHighlighter {
  30         protected $mCleanWikitext = true;
  31
  32         function __construct( $cleanupWikitext = true ) {
  33                 $this->mCleanWikitext = $cleanupWikitext;
  34         }
  35
  36         /**
  37          * Default implementation of wikitext highlighting
  38          *
  39          * @param string $text
  40          * @param array $terms Terms to highlight (unescaped)
  41          * @param int $contextlines
  42          * @param int $contextchars
  43          * @return string
  44          */
  45         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
  46                 global $wgContLang, $wgSearchHighlightBoundaries;
  47
  48                 if ( $text == '' ) {
  49                         return '';
  50                 }
  51
  52                 // spli text into text + templates/links/tables
  53                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
  54                 // first capture group is for detecting nested templates/links/tables/references
  55                 $endPatterns = array(
  56                         1 => '/(\{\{)|(\}\})/', // template
  57                         2 => '/(\[\[)|(\]\])/', // image
  58                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
  59
  60                 // @todo FIXME: This should prolly be a hook or something
  61                 // instead of hardcoding a class name from the Cite extension
  62                 if ( class_exists( 'Cite' ) ) {
  63                         $spat .= '|(<ref>)'; // references via cite extension
  64                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
  65                 }
  66                 $spat .= '/';
  67                 $textExt = array(); // text extracts
  68                 $otherExt = array(); // other extracts
  69                 $start = 0;
  70                 $textLen = strlen( $text );
  71                 $count = 0; // sequence number to maintain ordering
  72                 while ( $start < $textLen ) {
  73                         // find start of template/image/table
  74                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
  75                                 $epat = '';
  76                                 foreach ( $matches as $key => $val ) {
  77                                         if ( $key > 0 && $val[1] != - 1 ) {
  78                                                 if ( $key == 2 ) {
  79                                                         // see if this is an image link
  80                                                         $ns = substr( $val[0], 2, - 1 );
  81                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
  82                                                                 break;
  83                                                         }
  84
  85                                                 }
  86                                                 $epat = $endPatterns[$key];
  87                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
  88                                                 $start = $val[1];
  89                                                 break;
  90                                         }
  91                                 }
  92                                 if ( $epat ) {
  93                                         // find end (and detect any nested elements)
  94                                         $level = 0;
  95                                         $offset = $start + 1;
  96                                         $found = false;
  97                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
  98                                                 if ( array_key_exists( 2, $endMatches ) ) {
  99                                                         // found end
 100                                                         if ( $level == 0 ) {
 101                                                                 $len = strlen( $endMatches[2][0] );
 102                                                                 $off = $endMatches[2][1];
 103                                                                 $this->splitAndAdd( $otherExt, $count,
 104                                                                         substr( $text, $start, $off + $len - $start ) );
 105                                                                 $start = $off + $len;
 106                                                                 $found = true;
 107                                                                 break;
 108                                                         } else {
 109                                                                 // end of nested element
 110                                                                 $level -= 1;
 111                                                         }
 112                                                 } else {
 113                                                         // nested
 114                                                         $level += 1;
 115                                                 }
 116                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
 117                                         }
 118                                         if ( !$found ) {
 119                                                 // couldn't find appropriate closing tag, skip
 120                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
 121                                                 $start += strlen( $matches[0][0] );
 122                                         }
 123                                         continue;
 124                                 }
 125                         }
 126                         // else: add as text extract
 127                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
 128                         break;
 129                 }
 130
 131                 $all = $textExt + $otherExt; // these have disjunct key sets
 132
 133                 // prepare regexps
 134                 foreach ( $terms as $index => $term ) {
 135                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 136                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
 137                                 $terms[$index] = preg_replace_callback(
 138                                         '/./us',
 139                                         array( $this, 'caseCallback' ),
 140                                         $terms[$index]
 141                                 );
 142                         } else {
 143                                 $terms[$index] = $term;
 144                         }
 145                 }
 146                 $anyterm = implode( '|', $terms );
 147                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
 148
 149                 // @todo FIXME: A hack to scale contextchars, a correct solution
 150                 // would be to have contextchars actually be char and not byte
 151                 // length, and do proper utf-8 substrings and lengths everywhere,
 152                 // but PHP is making that very hard and unclean to implement :(
 153                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
 154                 $contextchars = intval( $contextchars * $scale );
 155
 156                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 157                 $patPost = "($wgSearchHighlightBoundaries|$)";
 158
 159                 $pat1 = "/(" . $phrase . ")/ui";
 160                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
 161
 162                 $left = $contextlines;
 163
 164                 $snippets = array();
 165                 $offsets = array();
 166
 167                 // show beginning only if it contains all words
 168                 $first = 0;
 169                 $firstText = '';
 170                 foreach ( $textExt as $index => $line ) {
 171                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
 172                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 173                                 $first = $index;
 174                                 break;
 175                         }
 176                 }
 177                 if ( $firstText ) {
 178                         $succ = true;
 179                         // check if first text contains all terms
 180                         foreach ( $terms as $term ) {
 181                                 if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
 182                                         $succ = false;
 183                                         break;
 184                                 }
 185                         }
 186                         if ( $succ ) {
 187                                 $snippets[$first] = $firstText;
 188                                 $offsets[$first] = 0;
 189                         }
 190                 }
 191                 if ( !$snippets ) {
 192                         // match whole query on text
 193                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
 194                         // match whole query on templates/tables/images
 195                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
 196                         // match any words on text
 197                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
 198                         // match any words on templates/tables/images
 199                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
 200
 201                         ksort( $snippets );
 202                 }
 203
 204                 // add extra chars to each snippet to make snippets constant size
 205                 $extended = array();
 206                 if ( count( $snippets ) == 0 ) {
 207                         // couldn't find the target words, just show beginning of article
 208                         if ( array_key_exists( $first, $all ) ) {
 209                                 $targetchars = $contextchars * $contextlines;
 210                                 $snippets[$first] = '';
 211                                 $offsets[$first] = 0;
 212                         }
 213                 } else {
 214                         // if begin of the article contains the whole phrase, show only that !!
 215                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
 216                                 && $offsets[$first] < $contextchars * 2 ) {
 217                                 $snippets = array( $first => $snippets[$first] );
 218                         }
 219
 220                         // calc by how much to extend existing snippets
 221                         $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
 222                 }
 223
 224                 foreach ( $snippets as $index => $line ) {
 225                         $extended[$index] = $line;
 226                         $len = strlen( $line );
 227                         if ( $len < $targetchars - 20 ) {
 228                                 // complete this line
 229                                 if ( $len < strlen( $all[$index] ) ) {
 230                                         $extended[$index] = $this->extract(
 231                                                 $all[$index],
 232                                                 $offsets[$index],
 233                                                 $offsets[$index] + $targetchars,
 234                                                 $offsets[$index]
 235                                         );
 236                                         $len = strlen( $extended[$index] );
 237                                 }
 238
 239                                 // add more lines
 240                                 $add = $index + 1;
 241                                 while ( $len < $targetchars - 20
 242                                                 && array_key_exists( $add, $all )
 243                                                 && !array_key_exists( $add, $snippets ) ) {
 244                                         $offsets[$add] = 0;
 245                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 246                                         $extended[$add] = $tt;
 247                                         $len += strlen( $tt );
 248                                         $add++;
 249                                 }
 250                         }
 251                 }
 252
 253                 // $snippets = array_map( 'htmlspecialchars', $extended );
 254                 $snippets = $extended;
 255                 $last = - 1;
 256                 $extract = '';
 257                 foreach ( $snippets as $index => $line ) {
 258                         if ( $last == - 1 ) {
 259                                 $extract .= $line; // first line
 260                         } elseif ( $last + 1 == $index
 261                                 && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
 262                         ) {
 263                                 $extract .= " " . $line; // continous lines
 264                         } else {
 265                                 $extract .= '<b> ... </b>' . $line;
 266                         }
 267
 268                         $last = $index;
 269                 }
 270                 if ( $extract ) {
 271                         $extract .= '<b> ... </b>';
 272                 }
 273
 274                 $processed = array();
 275                 foreach ( $terms as $term ) {
 276                         if ( !isset( $processed[$term] ) ) {
 277                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
 278                                 $extract = preg_replace( $pat3,
 279                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 280                                 $processed[$term] = true;
 281                         }
 282                 }
 283
 284                 return $extract;
 285         }
 286
 287         /**
 288          * Split text into lines and add it to extracts array
 289          *
 290          * @param array $extracts Index -> $line
 291          * @param int $count
 292          * @param string $text
 293          */
 294         function splitAndAdd( &$extracts, &$count, $text ) {
 295                 $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
 296                 foreach ( $split as $line ) {
 297                         $tt = trim( $line );
 298                         if ( $tt ) {
 299                                 $extracts[$count++] = $tt;
 300                         }
 301                 }
 302         }
 303
 304         /**
 305          * Do manual case conversion for non-ascii chars
 306          *
 307          * @param array $matches
 308          * @return string
 309          */
 310         function caseCallback( $matches ) {
 311                 global $wgContLang;
 312                 if ( strlen( $matches[0] ) > 1 ) {
 313                         return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
 314                 } else {
 315                         return $matches[0];
 316                 }
 317         }
 318
 319         /**
 320          * Extract part of the text from start to end, but by
 321          * not chopping up words
 322          * @param string $text
 323          * @param int $start
 324          * @param int $end
 325          * @param int $posStart (out) actual start position
 326          * @param int $posEnd (out) actual end position
 327          * @return string
 328          */
 329         function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
 330                 if ( $start != 0 ) {
 331                         $start = $this->position( $text, $start, 1 );
 332                 }
 333                 if ( $end >= strlen( $text ) ) {
 334                         $end = strlen( $text );
 335                 } else {
 336                         $end = $this->position( $text, $end );
 337                 }
 338
 339                 if ( !is_null( $posStart ) ) {
 340                         $posStart = $start;
 341                 }
 342                 if ( !is_null( $posEnd ) ) {
 343                         $posEnd = $end;
 344                 }
 345
 346                 if ( $end > $start ) {
 347                         return substr( $text, $start, $end - $start );
 348                 } else {
 349                         return '';
 350                 }
 351         }
 352
 353         /**
 354          * Find a nonletter near a point (index) in the text
 355          *
 356          * @param string $text
 357          * @param int $point
 358          * @param int $offset Offset to found index
 359          * @return int Nearest nonletter index, or beginning of utf8 char if none
 360          */
 361         function position( $text, $point, $offset = 0 ) {
 362                 $tolerance = 10;
 363                 $s = max( 0, $point - $tolerance );
 364                 $l = min( strlen( $text ), $point + $tolerance ) - $s;
 365                 $m = array();
 366
 367                 if ( preg_match(
 368                         '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
 369                         substr( $text, $s, $l ),
 370                         $m,
 371                         PREG_OFFSET_CAPTURE
 372                 ) ) {
 373                         return $m[0][1] + $s + $offset;
 374                 } else {
 375                         // check if point is on a valid first UTF8 char
 376                         $char = ord( $text[$point] );
 377                         while ( $char >= 0x80 && $char < 0xc0 ) {
 378                                 // skip trailing bytes
 379                                 $point++;
 380                                 if ( $point >= strlen( $text ) ) {
 381                                         return strlen( $text );
 382                                 }
 383                                 $char = ord( $text[$point] );
 384                         }
 385
 386                         return $point;
 387
 388                 }
 389         }
 390
 391         /**
 392          * Search extracts for a pattern, and return snippets
 393          *
 394          * @param string $pattern Regexp for matching lines
 395          * @param array $extracts Extracts to search
 396          * @param int $linesleft Number of extracts to make
 397          * @param int $contextchars Length of snippet
 398          * @param array $out Map for highlighted snippets
 399          * @param array $offsets Map of starting points of snippets
 400          * @protected
 401          */
 402         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
 403                 if ( $linesleft == 0 ) {
 404                         return; // nothing to do
 405                 }
 406                 foreach ( $extracts as $index => $line ) {
 407                         if ( array_key_exists( $index, $out ) ) {
 408                                 continue; // this line already highlighted
 409                         }
 410
 411                         $m = array();
 412                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
 413                                 continue;
 414                         }
 415
 416                         $offset = $m[0][1];
 417                         $len = strlen( $m[0][0] );
 418                         if ( $offset + $len < $contextchars ) {
 419                                 $begin = 0;
 420                         } elseif ( $len > $contextchars ) {
 421                                 $begin = $offset;
 422                         } else {
 423                                 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
 424                         }
 425
 426                         $end = $begin + $contextchars;
 427
 428                         $posBegin = $begin;
 429                         // basic snippet from this line
 430                         $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
 431                         $offsets[$index] = $posBegin;
 432                         $linesleft--;
 433                         if ( $linesleft == 0 ) {
 434                                 return;
 435                         }
 436                 }
 437         }
 438
 439         /**
 440          * Basic wikitext removal
 441          * @protected
 442          * @param string $text
 443          * @return mixed
 444          */
 445         function removeWiki( $text ) {
 446                 $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
 447                 $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
 448                 $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
 449                 $text = preg_replace_callback(
 450                         "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
 451                         array( $this, 'linkReplace' ),
 452                         $text
 453                 );
 454                 $text = preg_replace( "/<\/?[^>]+>/", "", $text );
 455                 $text = preg_replace( "/'''''/", "", $text );
 456                 $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
 457                 $text = preg_replace( "/''/", "", $text );
 458
 459                 return $text;
 460         }
 461
 462         /**
 463          * callback to replace [[target|caption]] kind of links, if
 464          * the target is category or image, leave it
 465          *
 466          * @param array $matches
 467          * @return string
 468          */
 469         function linkReplace( $matches ) {
 470                 $colon = strpos( $matches[1], ':' );
 471                 if ( $colon === false ) {
 472                         return $matches[2]; // replace with caption
 473                 }
 474                 global $wgContLang;
 475                 $ns = substr( $matches[1], 0, $colon );
 476                 $index = $wgContLang->getNsIndex( $ns );
 477                 if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
 478                         return $matches[0]; // return the whole thing
 479                 } else {
 480                         return $matches[2];
 481                 }
 482         }
 483
 484         /**
 485          * Simple & fast snippet extraction, but gives completely unrelevant
 486          * snippets
 487          *
 488          * @param string $text
 489          * @param array $terms
 490          * @param int $contextlines
 491          * @param int $contextchars
 492          * @return string
 493          */
 494         public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
 495                 global $wgContLang;
 496
 497                 $lines = explode( "\n", $text );
 498
 499                 $terms = implode( '|', $terms );
 500                 $max = intval( $contextchars ) + 1;
 501                 $pat1 = "/(.*)($terms)(.{0,$max})/i";
 502
 503                 $lineno = 0;
 504
 505                 $extract = "";
 506                 foreach ( $lines as $line ) {
 507                         if ( 0 == $contextlines ) {
 508                                 break;
 509                         }
 510                         ++$lineno;
 511                         $m = array();
 512                         if ( !preg_match( $pat1, $line, $m ) ) {
 513                                 continue;
 514                         }
 515                         --$contextlines;
 516                         // truncate function changes ... to relevant i18n message.
 517                         $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
 518
 519                         if ( count( $m ) < 3 ) {
 520                                 $post = '';
 521                         } else {
 522                                 $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
 523                         }
 524
 525                         $found = $m[2];
 526
 527                         $line = htmlspecialchars( $pre . $found . $post );
 528                         $pat2 = '/(' . $terms . ")/i";
 529                         $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );
 530
 531                         $extract .= "${line}\n";
 532                 }
 533
 534                 return $extract;
 535         }
 536
 537         /**
 538          * Returns the first few lines of the text
 539          *
 540          * @param string $text
 541          * @param int $contextlines Max number of returned lines
 542          * @param int $contextchars Average number of characters per line
 543          * @return string
 544          */
 545         public function highlightNone( $text, $contextlines, $contextchars ) {
 546                 $match = array();
 547                 $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
 548                 $text = str_replace( "\n\n", "\n", $text ); // remove empty lines
 549                 preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );
 550
 551                 // Trim and limit to max number of chars
 552                 $text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
 553                 return str_replace( "\n", '<br>', $text );
 554         }
 555 }