includes/search/SearchUpdate.php

   1 <?php
   2 /**
   3  * This program is free software; you can redistribute it and/or modify
   4  * it under the terms of the GNU General Public License as published by
   5  * the Free Software Foundation; either version 2 of the License, or
   6  * (at your option) any later version.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11  * GNU General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public License along
  14  * with this program; if not, write to the Free Software Foundation, Inc.,
  15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16  * http://www.gnu.org/copyleft/gpl.html
  17  *
  18  * @file
  19  */
  20
  21 namespace MediaWiki\Search;
  22
  23 use MediaWiki\Content\Content;
  24 use MediaWiki\Deferred\DeferrableUpdate;
  25 use MediaWiki\Logger\LoggerFactory;
  26 use MediaWiki\MainConfigNames;
  27 use MediaWiki\MediaWikiServices;
  28 use MediaWiki\Page\ExistingPageRecord;
  29 use MediaWiki\Page\PageIdentity;
  30 use MediaWiki\Title\Title;
  31 use SearchEngine;
  32 use Wikimedia\Rdbms\IDBAccessObject;
  33
  34 /**
  35  * Database independent search index updater
  36  *
  37  * @internal
  38  * @ingroup Search
  39  */
  40 class SearchUpdate implements DeferrableUpdate {
  41         /** @var int Page id being updated */
  42         private $id = 0;
  43
  44         /** @var PageIdentity The page we're updating */
  45         private $page;
  46
  47         /** @var Content|null Content of the page (not text) */
  48         private $content;
  49
  50         /** @var ExistingPageRecord|null */
  51         private $latestPage = null;
  52
  53         /**
  54          * @param int $id Page id to update
  55          * @param PageIdentity $page Page to update
  56          * @param Content|null $c Content of the page to update.
  57          */
  58         public function __construct( $id, $page, ?Content $c = null ) {
  59                 $this->page = $page;
  60                 $this->id = $id;
  61                 $this->content = $c;
  62         }
  63
  64         /**
  65          * Perform actual update for the entry
  66          */
  67         public function doUpdate() {
  68                 $services = MediaWikiServices::getInstance();
  69                 $searchEngineConfig = $services->getSearchEngineConfig();
  70
  71                 if ( $services->getMainConfig()->get( MainConfigNames::DisableSearchUpdate ) || !$this->id ) {
  72                         LoggerFactory::getInstance( "search" )
  73                                 ->debug( "Skipping update: search updates disabled by config" );
  74                         return;
  75                 }
  76
  77                 $seFactory = $services->getSearchEngineFactory();
  78                 foreach ( $searchEngineConfig->getSearchTypes() as $type ) {
  79                         $search = $seFactory->create( $type );
  80                         if ( !$search->supports( 'search-update' ) ) {
  81                                 continue;
  82                         }
  83
  84                         $normalTitle = $this->getNormalizedTitle( $search );
  85
  86                         if ( $this->getLatestPage() === null ) {
  87                                 $search->delete( $this->id, $normalTitle );
  88                                 continue;
  89                         } elseif ( $this->content === null ) {
  90                                 $search->updateTitle( $this->id, $normalTitle );
  91                                 continue;
  92                         }
  93
  94                         $text = $this->content !== null ? $this->content->getTextForSearchIndex() : '';
  95                         $text = $this->updateText( $text, $search );
  96
  97                         # Perform the actual update
  98                         $search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
  99                 }
 100         }
 101
 102         /**
 103          * Clean text for indexing. Only really suitable for indexing in databases.
 104          * If you're using a real search engine, you'll probably want to override
 105          * this behavior and do something nicer with the original wikitext.
 106          * @param string $text
 107          * @param SearchEngine|null $se Search engine
 108          * @return string
 109          */
 110         public function updateText( $text, ?SearchEngine $se = null ) {
 111                 $services = MediaWikiServices::getInstance();
 112                 $contLang = $services->getContentLanguage();
 113                 # Language-specific strip/conversion
 114                 $text = $contLang->normalizeForSearch( $text );
 115                 $se = $se ?: $services->newSearchEngine();
 116                 $lc = $se->legalSearchChars() . '&#;';
 117
 118                 # Strip HTML markup
 119                 $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
 120                         ' ', $contLang->lc( " " . $text . " " ) );
 121                 $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/",
 122                         "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings
 123
 124                 # Strip external URLs
 125                 $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
 126                 $protos = "http|https|ftp|mailto|news|gopher";
 127                 $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
 128                 $text = preg_replace( $pat, "\\1 \\3", $text );
 129
 130                 $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
 131                 $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
 132                 $text = preg_replace( $p1, "\\1 ", $text );
 133                 $text = preg_replace( $p2, "\\1 \\3 ", $text );
 134
 135                 $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
 136                         "\\1\\2 \\2\\3", $text ); # Handle [[game]]s
 137
 138                 # Strip all remaining non-search characters
 139                 $text = preg_replace( "/[^{$lc}]+/", " ", $text );
 140
 141                 /**
 142                  * Handle 's, s'
 143                  *
 144                  *   $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
 145                  *   $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
 146                  *
 147                  * These tail-anchored regexps are very slow. The worst case comes
 148                  * when Japanese or Chinese text (ie, no word spacing) is written on
 149                  * a wiki configured for Western UTF-8 mode. The Unicode characters are
 150                  * expanded to hex codes and the "words" are very long paragraph-length
 151                  * monstrosities. On a large page the above regexps may take over 20
 152                  * seconds *each* on a 1GHz-level processor.
 153                  *
 154                  * Following are reversed versions which are consistently fast
 155                  * (about 3 milliseconds on 1GHz-level processor).
 156                  */
 157                 $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
 158                 $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );
 159
 160                 # Strip wiki '' and '''
 161                 $text = preg_replace( "/''[']*/", " ", $text );
 162
 163                 return $text;
 164         }
 165
 166         /**
 167          * Get ExistingPageRecord for the SearchUpdate $id using IDBAccessObject::READ_LATEST
 168          * and ensure using the same ExistingPageRecord object if there are multiple
 169          * SearchEngine types.
 170          *
 171          * Returns null if a page has been deleted or is not found.
 172          *
 173          * @return ExistingPageRecord|null
 174          */
 175         private function getLatestPage() {
 176                 if ( !$this->latestPage ) {
 177                         $this->latestPage = MediaWikiServices::getInstance()->getPageStore()
 178                                 ->getPageById( $this->id, IDBAccessObject::READ_LATEST );
 179                 }
 180
 181                 return $this->latestPage;
 182         }
 183
 184         /**
 185          * Get a normalized string representation of a title suitable for
 186          * including in a search index
 187          *
 188          * @param SearchEngine $search
 189          * @return string A stripped-down title string ready for the search index
 190          */
 191         private function getNormalizedTitle( SearchEngine $search ) {
 192                 $contLang = MediaWikiServices::getInstance()->getContentLanguage();
 193                 $title = Title::newFromPageIdentity( $this->page )->getText();
 194
 195                 $lc = $search->legalSearchChars() . '&#;';
 196                 $t = $contLang->normalizeForSearch( $title );
 197                 $t = preg_replace( "/[^{$lc}]+/", ' ', $t );
 198                 $t = $contLang->lc( $t );
 199
 200                 # Handle 's, s'
 201                 $t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
 202                 $t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );
 203
 204                 $t = preg_replace( "/\\s+/", ' ', $t );
 205
 206                 return $search->normalizeText( trim( $t ) );
 207         }
 208 }