Localisation updates from https://translatewiki.net.
[mediawiki.git] / includes / content / WikiTextStructure.php
blob09c8136673294ae337ea8a6379d6205b1555fdd7
1 <?php
3 namespace MediaWiki\Content;
5 use HtmlFormatter\HtmlFormatter;
6 use MediaWiki\Parser\ParserOutput;
7 use MediaWiki\Parser\Sanitizer;
/**
 * Class allowing to explore the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes its headings, its opening / main /
 * auxiliary text and its "defaultsort" page property.
 */
class WikiTextStructure {

	/** Filtered text found before the first heading, or null when there is none. */
	private ?string $openingText = null;
	/** Main text with excluded and auxiliary elements removed; null until extraction has run. */
	private ?string $allText = null;
	/** @var string[] Text of every auxiliary element that was removed from the main text. */
	private array $auxText = [];
	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.
	 *
	 * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
	 * I don't know, maybe there is a band named Word [2] Foo or something. Whatever.
	 *
	 * So we only strip things that look like <sup> tags wrapping a reference. And since the data
	 * looks like:
	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Tag-free heading texts, minus the ignored headings. Empty when the
	 *  parser output has no TOC data.
	 */
	public function headings() {
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return [];
		}

		$ignoredHeadings = $this->getIgnoredHeadings();
		$headings = [];
		foreach ( $tocData->getSections() as $section ) {
			$heading = $section->line;

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. The comparison is strict so that numeric-looking
			// headings (e.g. "1e3" vs "1000") are never treated as equal by type juggling.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}

		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Each line is one setting; everything from a "#" to the end of the line is a
	 * comment, surrounding whitespace is trimmed and empty lines are dropped.
	 *
	 * @param string $message
	 *
	 * @return string[] Non-empty settings, keyed by their original (0-based) line index.
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );

		// Remove empty lines. Compare against '' explicitly: a bare array_filter()
		// would also discard the literal setting "0", which PHP treats as falsy.
		return array_filter( $lines, static fn ( string $line ): bool => $line !== '' );
	}

	/**
	 * Gets a list of heading to ignore.
	 *
	 * The list is read from the "search-ignored-headings" message in the content
	 * language and memoised in a function-static variable after the first call.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( !$source->isDisabled() ) {
				// After parsing we just have headings!
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}

		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Runs at most once per instance: a non-null $allText marks the work as done.
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}

		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			// empty text - nothing to seek here
			$this->allText = '';
			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text. These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text
	 *
	 * @return string|null Filtered, tag-free text preceding the first <h1>-<h6>, or null
	 *  when there is no heading or nothing is left in front of it.
	 */
	private function extractTextBeforeFirstHeading( $text ): ?string {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}

		// $matches[0][1] is the byte offset at which the first heading tag starts.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		// Explicit '' comparison: a lone "0" is real content, not "no text".
		if ( $text === '' ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get the filtered text that precedes the first heading.
	 *
	 * @return string|null
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();

		return $this->openingText;
	}

	/**
	 * Get the page text with excluded and auxiliary elements removed.
	 *
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();

		return $this->allText;
	}

	/**
	 * Get the text of the elements matched by the auxiliary selectors.
	 *
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();

		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );
		if ( $sort === false ) {
			return null;
		}

		return $sort;
	}
}
// Registers the un-namespaced name 'WikiTextStructure' as an alias of the class above.
/** @deprecated class alias since 1.43 */
class_alias( WikiTextStructure::class, 'WikiTextStructure' );