includes/content/WikiTextStructure.php

   1 <?php
   2
   3 namespace MediaWiki\Content;
   4
   5 use HtmlFormatter\HtmlFormatter;
   6 use MediaWiki\Parser\ParserOutput;
   7 use MediaWiki\Parser\Sanitizer;
   8
   9 /**
  10  * Class allowing to explore the structure of parsed wikitext.
  11  */
  12 class WikiTextStructure {
  13
  14         private ?string $openingText = null;
  15         private ?string $allText = null;
  16         /** @var string[] */
  17         private array $auxText = [];
  18         private ParserOutput $parserOutput;
  19
  20         /**
  21          * Selectors to elements that are excluded entirely from search
  22          */
  23         private const EXCLUDED_ELEMENT_SELECTORS = [
  24                 // "it looks like you don't have javascript enabled..." – do not need to index
  25                 'audio', 'video',
  26                 // CSS stylesheets aren't content
  27                 'style',
  28                 // The [1] for references from Cite
  29                 'sup.reference',
  30                 // The ↑ next to references in the references section from Cite
  31                 '.mw-cite-backlink',
  32                 // Headings are already indexed in their own field.
  33                 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  34                 // Collapsed fields are hidden by default, so we don't want them showing up.
  35                 '.autocollapse',
  36                 // Content explicitly decided to be not searchable by editors such
  37                 // as custom navigation templates.
  38                 '.navigation-not-searchable',
  39                 // User-facing interface code prompting the user to act from WikibaseMediaInfo
  40                 '.wbmi-entityview-emptyCaption',
  41         ];
  42
  43         /**
  44          * Selectors to elements that are considered auxiliary to the article text for search
  45          */
  46         private const AUXILIARY_ELEMENT_SELECTORS = [
  47                 // Thumbnail captions aren't really part of the text proper
  48                 '.thumbcaption',
  49                 'figcaption',
  50                 // Neither are tables
  51                 'table',
  52                 // Common style for "See also:".
  53                 '.rellink',
  54                 // Common style for calling out helpful links at the top of the article.
  55                 '.dablink',
  56                 // New class users can use to mark stuff as auxiliary to searches.
  57                 '.searchaux',
  58         ];
  59
  60         /**
  61          * @param ParserOutput $parserOutput
  62          */
  63         public function __construct( ParserOutput $parserOutput ) {
  64                 $this->parserOutput = $parserOutput;
  65         }
  66
  67         /**
  68          * Gets headings from the page.
  69          *
  70          * @return string[]
  71          * First strip out things that look like references.  We can't use HTML filtering because
  72          * the references come back as <sup> tags without a class.  To keep from breaking stuff like
  73          *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
  74          * we don't remove the whole <sup> tag.
  75          *
  76          * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
  77          * I don't know, maybe there is a band named Word [2] Foo r something. Whatever.
  78          *
  79          * So we only strip things that look like <sup> tags wrapping a reference. And since the data
  80          * looks like:
  81          *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
  82          * we can not really use HtmlFormatter as we have no suitable selector.
  83          */
  84         public function headings() {
  85                 $headings = [];
  86                 $tocData = $this->parserOutput->getTOCData();
  87                 if ( $tocData === null ) {
  88                         return $headings;
  89                 }
  90                 $ignoredHeadings = $this->getIgnoredHeadings();
  91                 foreach ( $tocData->getSections() as $heading ) {
  92                         $heading = $heading->line;
  93
  94                         // Some wikis wrap the brackets in a span:
  95                         // https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
  96                         $heading = preg_replace( '/<\/?span>/', '', $heading );
  97                         // Normalize [] so the following regexp would work.
  98                         $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
  99                         $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );
 100
 101                         // Strip tags from the heading or else we'll display them (escaped) in search results
 102                         $heading = trim( Sanitizer::stripAllTags( $heading ) );
 103
 104                         // Note that we don't take the level of the heading into account - all headings are equal.
 105                         // Except the ones we ignore.
 106                         if ( !in_array( $heading, $ignoredHeadings ) ) {
 107                                 $headings[] = $heading;
 108                         }
 109                 }
 110
 111                 return $headings;
 112         }
 113
 114         /**
 115          * Parse a message content into an array. This function is generally used to
 116          * parse settings stored as i18n messages (see search-ignored-headings).
 117          *
 118          * @param string $message
 119          *
 120          * @return string[]
 121          */
 122         public static function parseSettingsInMessage( $message ) {
 123                 $lines = explode( "\n", $message );
 124                 // Remove comments
 125                 $lines = preg_replace( '/#.*$/', '', $lines );
 126                 // Remove extra spaces
 127                 $lines = array_map( 'trim', $lines );
 128
 129                 // Remove empty lines
 130                 return array_filter( $lines );
 131         }
 132
 133         /**
 134          * Gets a list of heading to ignore.
 135          *
 136          * @return string[]
 137          */
 138         private function getIgnoredHeadings() {
 139                 static $ignoredHeadings = null;
 140                 if ( $ignoredHeadings === null ) {
 141                         $ignoredHeadings = [];
 142                         $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
 143                         if ( !$source->isDisabled() ) {
 144                                 $lines = self::parseSettingsInMessage( $source->plain() );
 145                                 // Now we just have headings!
 146                                 $ignoredHeadings = $lines;
 147                         }
 148                 }
 149
 150                 return $ignoredHeadings;
 151         }
 152
 153         /**
 154          * Extract parts of the text - opening, main and auxiliary.
 155          */
 156         private function extractWikitextParts() {
 157                 if ( $this->allText !== null ) {
 158                         return;
 159                 }
 160                 $text = $this->parserOutput->getRawText();
 161                 if ( $text === '' ) {
 162                         $this->allText = "";
 163
 164                         // empty text - nothing to seek here
 165                         return;
 166                 }
 167
 168                 $this->openingText = $this->extractTextBeforeFirstHeading( $text );
 169
 170                 $formatter = new HtmlFormatter( $text );
 171
 172                 // Strip elements from the page that we never want in the search text.
 173                 $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
 174                 $formatter->filterContent();
 175
 176                 // Strip elements from the page that are auxiliary text.  These will still be
 177                 // searched, but matches will be ranked lower and non-auxiliary matches will be
 178                 // preferred in highlighting.
 179                 $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
 180                 $auxiliaryElements = $formatter->filterContent();
 181                 $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 182                 foreach ( $auxiliaryElements as $auxiliaryElement ) {
 183                         $this->auxText[] =
 184                                 trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
 185                 }
 186         }
 187
 188         /**
 189          * Get text before first heading.
 190          *
 191          * @param string $text
 192          *
 193          * @return string|null
 194          */
 195         private function extractTextBeforeFirstHeading( $text ) {
 196                 $matches = [];
 197                 if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
 198                         // There isn't a first heading, so we interpret this as the article
 199                         // being entirely without heading.
 200                         return null;
 201                 }
 202                 $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
 203                 if ( !$text ) {
 204                         // There isn't any text before the first heading, so we declare there isn't
 205                         // a first heading.
 206                         return null;
 207                 }
 208
 209                 $formatter = new HtmlFormatter( $text );
 210                 $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
 211                 $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
 212                 $formatter->filterContent();
 213                 $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 214
 215                 if ( !$text ) {
 216                         // There isn't any text after filtering before the first heading, so we declare
 217                         // that there isn't a first heading.
 218                         return null;
 219                 }
 220
 221                 return $text;
 222         }
 223
 224         /**
 225          * @return string|null
 226          */
 227         public function getOpeningText() {
 228                 $this->extractWikitextParts();
 229
 230                 return $this->openingText;
 231         }
 232
 233         /**
 234          * @return string
 235          */
 236         public function getMainText() {
 237                 $this->extractWikitextParts();
 238
 239                 return $this->allText;
 240         }
 241
 242         /**
 243          * @return string[]
 244          */
 245         public function getAuxiliaryText() {
 246                 $this->extractWikitextParts();
 247
 248                 return $this->auxText;
 249         }
 250
 251         /**
 252          * Get the "defaultsort" property
 253          *
 254          * @return string|null
 255          */
 256         public function getDefaultSort() {
 257                 $sort = $this->parserOutput->getPageProperty( 'defaultsort' );
 258                 if ( $sort === false ) {
 259                         return null;
 260                 }
 261
 262                 return $sort;
 263         }
 264 }
 265
/**
 * Register the un-namespaced name 'WikiTextStructure' as an alias of this class.
 *
 * @deprecated class alias since 1.43
 */
class_alias( WikiTextStructure::class, 'WikiTextStructure' );