includes/search/SearchHighlighter.php

   1 <?php
   2 /**
   3  * Basic search engine highlighting
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup Search
  22  */
  23
  24 use MediaWiki\MainConfigNames;
  25 use MediaWiki\MediaWikiServices;
  26 use MediaWiki\Parser\Sanitizer;
  27 use MediaWiki\Registration\ExtensionRegistry;
  28
  29 /**
  30  * Highlight bits of wikitext
  31  *
  32  * @newable
  33  * @note marked as newable in 1.35 for lack of a better alternative,
  34  *       but should use a factory in the future.
  35  * @ingroup Search
  36  */
  37 class SearchHighlighter {
  38         public const DEFAULT_CONTEXT_LINES = 2;
  39         public const DEFAULT_CONTEXT_CHARS = 75;
  40
  41         /** @var bool */
  42         protected $mCleanWikitext = true;
  43
  44         /**
  45          * @stable to call
  46          * @warning If you pass false to this constructor, then
  47          *  the caller is responsible for HTML escaping.
  48          * @param bool $cleanupWikitext
  49          */
  50         public function __construct( $cleanupWikitext = true ) {
  51                 $this->mCleanWikitext = $cleanupWikitext;
  52         }
  53
  54         /**
  55          * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
  56          *
  57          * @param string $text
  58          * @param string[] $terms Terms to highlight (not html escaped but
  59          *   regex escaped via SearchDatabase::regexTerm())
  60          * @param int $contextlines
  61          * @param int $contextchars
  62          * @return string
  63          */
  64         public function highlightText(
  65                 $text,
  66                 $terms,
  67                 $contextlines = self::DEFAULT_CONTEXT_LINES,
  68                 $contextchars = self::DEFAULT_CONTEXT_CHARS
  69         ) {
  70                 $searchHighlightBoundaries = MediaWikiServices::getInstance()
  71                         ->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );
  72
  73                 if ( $text == '' ) {
  74                         return '';
  75                 }
  76
  77                 // split text into text + templates/links/tables
  78                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
  79                 // first capture group is for detecting nested templates/links/tables/references
  80                 $endPatterns = [
  81                         1 => '/(\{\{)|(\}\})/', // template
  82                         2 => '/(\[\[)|(\]\])/', // image
  83                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table
  84
  85                 // @todo FIXME: This should prolly be a hook or something
  86                 // instead of hardcoding the name of the Cite extension
  87                 if ( ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
  88                         $spat .= '|(<ref>)'; // references via cite extension
  89                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
  90                 }
  91                 $spat .= '/';
  92                 $textExt = []; // text extracts
  93                 $otherExt = []; // other extracts
  94                 $start = 0;
  95                 $textLen = strlen( $text );
  96                 $count = 0; // sequence number to maintain ordering
  97                 while ( $start < $textLen ) {
  98                         // find start of template/image/table
  99                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
 100                                 $epat = '';
 101                                 foreach ( $matches as $key => $val ) {
 102                                         if ( $key > 0 && $val[1] != -1 ) {
 103                                                 if ( $key == 2 ) {
 104                                                         // see if this is an image link
 105                                                         $ns = substr( $val[0], 2, -1 );
 106                                                         if (
 107                                                                 MediaWikiServices::getInstance()->getContentLanguage()->
 108                                                                 getNsIndex( $ns ) !== NS_FILE
 109                                                         ) {
 110                                                                 break;
 111                                                         }
 112
 113                                                 }
 114                                                 $epat = $endPatterns[$key];
 115                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 116                                                 $start = $val[1];
 117                                                 break;
 118                                         }
 119                                 }
 120                                 if ( $epat ) {
 121                                         // find end (and detect any nested elements)
 122                                         $level = 0;
 123                                         $offset = $start + 1;
 124                                         $found = false;
 125                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
 126                                                 if ( array_key_exists( 2, $endMatches ) ) {
 127                                                         // found end
 128                                                         if ( $level == 0 ) {
 129                                                                 $len = strlen( $endMatches[2][0] );
 130                                                                 $off = $endMatches[2][1];
 131                                                                 $this->splitAndAdd( $otherExt, $count,
 132                                                                         substr( $text, $start, $off + $len - $start ) );
 133                                                                 $start = $off + $len;
 134                                                                 $found = true;
 135                                                                 break;
 136                                                         } else {
 137                                                                 // end of nested element
 138                                                                 $level--;
 139                                                         }
 140                                                 } else {
 141                                                         // nested
 142                                                         $level++;
 143                                                 }
 144                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
 145                                         }
 146                                         if ( !$found ) {
 147                                                 // couldn't find appropriate closing tag, skip
 148                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
 149                                                 $start += strlen( $matches[0][0] );
 150                                         }
 151                                         continue;
 152                                 }
 153                         }
 154                         // else: add as text extract
 155                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
 156                         break;
 157                 }
 158                 '@phan-var string[] $textExt';
 159
 160                 $all = $textExt + $otherExt; // these have disjunct key sets
 161
 162                 // prepare regexps
 163                 foreach ( $terms as $index => $term ) {
 164                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 165                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
 166                                 $terms[$index] = preg_replace_callback(
 167                                         '/./us',
 168                                         [ $this, 'caseCallback' ],
 169                                         $terms[$index]
 170                                 );
 171                         } else {
 172                                 $terms[$index] = $term;
 173                         }
 174                 }
 175                 $anyterm = implode( '|', $terms );
 176                 $phrase = implode( "{$searchHighlightBoundaries}+", $terms );
 177                 // @todo FIXME: A hack to scale contextchars, a correct solution
 178                 // would be to have contextchars actually be char and not byte
 179                 // length, and do proper utf-8 substrings and lengths everywhere,
 180                 // but PHP is making that very hard and unclean to implement :(
 181                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
 182                 $contextchars = intval( $contextchars * $scale );
 183
 184                 $patPre = "(^|{$searchHighlightBoundaries})";
 185                 $patPost = "({$searchHighlightBoundaries}|$)";
 186
 187                 $pat1 = "/(" . $phrase . ")/ui";
 188                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
 189
 190                 $left = $contextlines;
 191
 192                 $snippets = [];
 193                 $offsets = [];
 194
 195                 // show beginning only if it contains all words
 196                 $first = 0;
 197                 $firstText = '';
 198                 foreach ( $textExt as $index => $line ) {
 199                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
 200                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 201                                 $first = $index;
 202                                 break;
 203                         }
 204                 }
 205                 if ( $firstText ) {
 206                         $succ = true;
 207                         // check if first text contains all terms
 208                         foreach ( $terms as $term ) {
 209                                 if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
 210                                         $succ = false;
 211                                         break;
 212                                 }
 213                         }
 214                         if ( $succ ) {
 215                                 $snippets[$first] = $firstText;
 216                                 $offsets[$first] = 0;
 217                         }
 218                 }
 219                 if ( !$snippets ) {
 220                         // match whole query on text
 221                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
 222                         // match whole query on templates/tables/images
 223                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
 224                         // match any words on text
 225                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
 226                         // match any words on templates/tables/images
 227                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
 228
 229                         ksort( $snippets );
 230                 }
 231
 232                 // add extra chars to each snippet to make snippets constant size
 233                 $extended = [];
 234                 if ( count( $snippets ) == 0 ) {
 235                         // couldn't find the target words, just show beginning of article
 236                         if ( array_key_exists( $first, $all ) ) {
 237                                 $targetchars = $contextchars * $contextlines;
 238                                 $snippets[$first] = '';
 239                                 $offsets[$first] = 0;
 240                         }
 241                 } else {
 242                         // if begin of the article contains the whole phrase, show only that !!
 243                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
 244                                 && $offsets[$first] < $contextchars * 2 ) {
 245                                 $snippets = [ $first => $snippets[$first] ];
 246                         }
 247
 248                         // calc by how much to extend existing snippets
 249                         $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
 250                 }
 251
 252                 foreach ( $snippets as $index => $line ) {
 253                         $extended[$index] = $line;
 254                         $len = strlen( $line );
 255                         // @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
 256                         // $targetchars is set when $snippes contains anything
 257                         if ( $len < $targetchars - 20 ) {
 258                                 // complete this line
 259                                 if ( $len < strlen( $all[$index] ) ) {
 260                                         $extended[$index] = $this->extract(
 261                                                 $all[$index],
 262                                                 $offsets[$index],
 263                                                 // @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
 264                                                 // $targetchars is set when $snippes contains anything
 265                                                 $offsets[$index] + $targetchars,
 266                                                 $offsets[$index]
 267                                         );
 268                                         $len = strlen( $extended[$index] );
 269                                 }
 270
 271                                 // add more lines
 272                                 $add = $index + 1;
 273                                 // @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
 274                                 // $targetchars is set when $snippes contains anything
 275                                 while ( $len < $targetchars - 20
 276                                                 && array_key_exists( $add, $all )
 277                                                 && !array_key_exists( $add, $snippets ) ) {
 278                                         $offsets[$add] = 0;
 279                                         // @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
 280                                         // $targetchars is set when $snippes contains anything
 281                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 282                                         $extended[$add] = $tt;
 283                                         $len += strlen( $tt );
 284                                         $add++;
 285                                 }
 286                         }
 287                 }
 288
 289                 // $snippets = array_map( 'htmlspecialchars', $extended );
 290                 $snippets = $extended;
 291                 $last = -1;
 292                 $extract = '';
 293                 foreach ( $snippets as $index => $line ) {
 294                         if ( $last == -1 ) {
 295                                 $extract .= $line; // first line
 296                         } elseif ( $last + 1 == $index
 297                                 && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
 298                         ) {
 299                                 $extract .= " " . $line; // continuous lines
 300                         } else {
 301                                 $extract .= '<b> ... </b>' . $line;
 302                         }
 303
 304                         $last = $index;
 305                 }
 306                 if ( $extract ) {
 307                         $extract .= '<b> ... </b>';
 308                 }
 309
 310                 $processed = [];
 311                 foreach ( $terms as $term ) {
 312                         if ( !isset( $processed[$term] ) ) {
 313                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
 314                                 $extract = preg_replace( $pat3,
 315                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 316                                 $processed[$term] = true;
 317                         }
 318                 }
 319
 320                 return $extract;
 321         }
 322
 323         /**
 324          * Split text into lines and add it to extracts array
 325          *
 326          * @param string[] &$extracts Index -> $line
 327          * @param int &$count
 328          * @param string $text
 329          */
 330         private function splitAndAdd( &$extracts, &$count, $text ) {
 331                 $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
 332                 foreach ( $split as $line ) {
 333                         $tt = trim( $line );
 334                         if ( $tt ) {
 335                                 $extracts[$count++] = $tt;
 336                         }
 337                 }
 338         }
 339
 340         /**
 341          * Do manual case conversion for non-ascii chars
 342          *
 343          * @param array $matches
 344          * @return string
 345          */
 346         private function caseCallback( $matches ) {
 347                 if ( strlen( $matches[0] ) > 1 ) {
 348                         $contLang = MediaWikiServices::getInstance()->getContentLanguage();
 349                         return '[' . $contLang->lc( $matches[0] ) .
 350                                 $contLang->uc( $matches[0] ) . ']';
 351                 } else {
 352                         return $matches[0];
 353                 }
 354         }
 355
 356         /**
 357          * Extract part of the text from start to end, but by
 358          * not chopping up words
 359          * @param string $text
 360          * @param int $start
 361          * @param int $end
 362          * @param int|null &$posStart (out) actual start position
 363          * @param int|null &$posEnd (out) actual end position
 364          * @return string
 365          */
 366         private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
 367                 if ( $start != 0 ) {
 368                         $start = $this->position( $text, $start, 1 );
 369                 }
 370                 if ( $end >= strlen( $text ) ) {
 371                         $end = strlen( $text );
 372                 } else {
 373                         $end = $this->position( $text, $end );
 374                 }
 375
 376                 if ( $posStart !== null ) {
 377                         $posStart = $start;
 378                 }
 379                 if ( $posEnd !== null ) {
 380                         $posEnd = $end;
 381                 }
 382
 383                 if ( $end > $start ) {
 384                         return substr( $text, $start, $end - $start );
 385                 } else {
 386                         return '';
 387                 }
 388         }
 389
 390         /**
 391          * Find a nonletter near a point (index) in the text
 392          *
 393          * @param string $text
 394          * @param int $point
 395          * @param int $offset Offset to found index
 396          * @return int Nearest nonletter index, or beginning of utf8 char if none
 397          */
 398         private function position( $text, $point, $offset = 0 ) {
 399                 $tolerance = 10;
 400                 $s = max( 0, $point - $tolerance );
 401                 $l = min( strlen( $text ), $point + $tolerance ) - $s;
 402                 $m = [];
 403
 404                 if ( preg_match(
 405                         '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
 406                         substr( $text, $s, $l ),
 407                         $m,
 408                         PREG_OFFSET_CAPTURE
 409                 ) ) {
 410                         return $m[0][1] + $s + $offset;
 411                 } else {
 412                         // check if point is on a valid first UTF8 char
 413                         $char = ord( $text[$point] );
 414                         while ( $char >= 0x80 && $char < 0xc0 ) {
 415                                 // skip trailing bytes
 416                                 $point++;
 417                                 if ( $point >= strlen( $text ) ) {
 418                                         return strlen( $text );
 419                                 }
 420                                 $char = ord( $text[$point] );
 421                         }
 422
 423                         return $point;
 424
 425                 }
 426         }
 427
 428         /**
 429          * Search extracts for a pattern, and return snippets
 430          *
 431          * @param string $pattern Regexp for matching lines
 432          * @param array $extracts Extracts to search
 433          * @param int &$linesleft Number of extracts to make
 434          * @param int &$contextchars Length of snippet
 435          * @param array &$out Map for highlighted snippets
 436          * @param array &$offsets Map of starting points of snippets
 437          */
 438         private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
 439                 if ( $linesleft == 0 ) {
 440                         return; // nothing to do
 441                 }
 442                 foreach ( $extracts as $index => $line ) {
 443                         if ( array_key_exists( $index, $out ) ) {
 444                                 continue; // this line already highlighted
 445                         }
 446
 447                         $m = [];
 448                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
 449                                 continue;
 450                         }
 451
 452                         $offset = $m[0][1];
 453                         $len = strlen( $m[0][0] );
 454                         if ( $offset + $len < $contextchars ) {
 455                                 $begin = 0;
 456                         } elseif ( $len > $contextchars ) {
 457                                 $begin = $offset;
 458                         } else {
 459                                 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
 460                         }
 461
 462                         $end = $begin + $contextchars;
 463
 464                         $posBegin = $begin;
 465                         // basic snippet from this line
 466                         $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
 467                         $offsets[$index] = $posBegin;
 468                         $linesleft--;
 469                         if ( $linesleft == 0 ) {
 470                                 return;
 471                         }
 472                 }
 473         }
 474
 475         /**
 476          * Basic wikitext removal
 477          * @param string $text
 478          * @return string
 479          */
 480         private function removeWiki( $text ) {
 481                 $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
 482                 $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
 483                 $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
 484                 $text = preg_replace_callback(
 485                         "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
 486                         [ $this, 'linkReplace' ],
 487                         $text
 488                 );
 489                 $text = preg_replace( "/<\/?[^>]+>/", "", $text );
 490                 $text = preg_replace( "/'''''/", "", $text );
 491                 $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
 492                 $text = preg_replace( "/''/", "", $text );
 493
 494                 // Note, the previous /<\/?[^>]+>/ is insufficient
 495                 // for XSS safety as the HTML tag can span multiple
 496                 // search results (T144845).
 497                 $text = Sanitizer::escapeHtmlAllowEntities( $text );
 498                 return $text;
 499         }
 500
 501         /**
 502          * callback to replace [[target|caption]] kind of links, if
 503          * the target is category or image, leave it
 504          *
 505          * @param array $matches
 506          * @return string
 507          */
 508         private function linkReplace( $matches ) {
 509                 $colon = strpos( $matches[1], ':' );
 510                 if ( $colon === false ) {
 511                         return $matches[2]; // replace with caption
 512                 }
 513                 $ns = substr( $matches[1], 0, $colon );
 514                 $index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
 515                 if ( $index !== false && ( $index === NS_FILE || $index === NS_CATEGORY ) ) {
 516                         return $matches[0]; // return the whole thing
 517                 } else {
 518                         return $matches[2];
 519                 }
 520         }
 521
 522         /**
 523          * Simple & fast snippet extraction, but gives completely irrelevant
 524          * snippets
 525          *
 526          * Used when $wgAdvancedSearchHighlighting is false.
 527          *
 528          * @param string $text
 529          * @param string[] $terms Escaped for regex by SearchDatabase::regexTerm()
 530          * @param int $contextlines
 531          * @param int $contextchars
 532          * @return string
 533          */
 534         public function highlightSimple(
 535                 $text,
 536                 $terms,
 537                 $contextlines = self::DEFAULT_CONTEXT_LINES,
 538                 $contextchars = self::DEFAULT_CONTEXT_CHARS
 539         ) {
 540                 $lines = explode( "\n", $text );
 541
 542                 $terms = implode( '|', $terms );
 543                 $max = intval( $contextchars ) + 1;
 544                 $pat1 = "/(.*)($terms)(.{0,$max})/ui";
 545
 546                 $extract = '';
 547                 $contLang = MediaWikiServices::getInstance()->getContentLanguage();
 548                 foreach ( $lines as $line ) {
 549                         if ( $contextlines == 0 ) {
 550                                 break;
 551                         }
 552                         $m = [];
 553                         if ( !preg_match( $pat1, $line, $m ) ) {
 554                                 continue;
 555                         }
 556                         --$contextlines;
 557                         // truncate function changes ... to relevant i18n message.
 558                         $pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );
 559
 560                         if ( count( $m ) < 3 ) {
 561                                 $post = '';
 562                         } else {
 563                                 $post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );
 564                         }
 565
 566                         $found = $m[2];
 567
 568                         $line = htmlspecialchars( $pre . $found . $post );
 569                         $pat2 = '/(' . $terms . ')/ui';
 570                         $line = preg_replace( $pat2, '<span class="searchmatch">\1</span>', $line );
 571
 572                         $extract .= "{$line}\n";
 573                 }
 574
 575                 return $extract;
 576         }
 577
 578         /**
 579          * Returns the first few lines of the text
 580          *
 581          * @param string $text
 582          * @param int $contextlines Max number of returned lines
 583          * @param int $contextchars Average number of characters per line
 584          * @return string
 585          */
 586         public function highlightNone(
 587                 $text,
 588                 $contextlines = self::DEFAULT_CONTEXT_LINES,
 589                 $contextchars = self::DEFAULT_CONTEXT_CHARS
 590         ) {
 591                 $match = [];
 592                 $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
 593                 $text = str_replace( "\n\n", "\n", $text ); // remove empty lines
 594                 preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );
 595
 596                 // Trim and limit to max number of chars
 597                 $text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
 598                 return str_replace( "\n", '<br>', $text );
 599         }
 600 }