Move FSFile classes to /fsfile
[mediawiki.git] / includes / content / WikiTextStructure.php
blobf4a6dc6a9416f0df29c2d1046a2355d0ae787b47
1 <?php
3 use HtmlFormatter\HtmlFormatter;
/**
 * Class allowing to explore structure of parsed wikitext.
 *
 * Wraps a ParserOutput and lazily splits its rendered HTML into the opening
 * text (everything before the first heading), the main text and the auxiliary
 * text (captions, tables, hatnotes, ...), all with tags stripped.
 */
class WikiTextStructure {
	/**
	 * @var string|null Text before the first heading, null when there is none
	 */
	private $openingText;

	/**
	 * @var string|null Main text of the page; null until the text was extracted
	 */
	private $allText;

	/**
	 * @var string[] Texts of the auxiliary elements removed from the main text
	 */
	private $auxText = [];

	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		// "it looks like you don't have javascript enabled..." - do not need to index
		'audio', 'video',
		// The [1] for references
		'sup.reference',
		// The ↑ next to references in the references section
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3',
		'h5', 'h6', 'h4',
		// Collapsed fields are hidden by default so we don't want them showing up.
		'.autocollapse',
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * WikiTextStructure constructor.
	 * @param ParserOutput $parserOutput
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
	 * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
	 * reference. And since the data looks like:
	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings, minus the ignored ones
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $section ) {
			$heading = $section['line'];

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. The comparison is strict so that numeric-looking
			// headings ("1e3" vs "1000") are never confused with each other.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Everything from a '#' to the end of a line is a comment; surrounding
	 * whitespace is trimmed and lines that end up empty are dropped. The
	 * original line indexes are kept as array keys.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );
		// Remove empty lines. An explicit comparison is needed: a bare array_filter()
		// would also throw away a line consisting of "0".
		return array_filter( $lines, static function ( $line ) {
			return $line !== '';
		} );
	}

	/**
	 * Get list of heading to ignore.
	 *
	 * The list is read from the search-ignored-headings message (falling back to
	 * cirrussearch-ignored-headings) and cached in a static for later calls.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( $source->isBlank() ) {
				// Try old version too, just in case
				$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			}
			if ( !$source->isDisabled() ) {
				// Now we just have headings!
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does the work only once: a non-null $allText marks the text as extracted.
	 */
	private function extractWikitextParts() {
		if ( $this->allText !== null ) {
			return;
		}
		$this->parserOutput->setEditSectionTokens( false );
		$this->parserOutput->setTOCEnabled( false );
		$text = $this->parserOutput->getText();
		if ( strlen( $text ) === 0 ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 * @param string $text Rendered HTML of the page
	 * @return string|null Tag-free text, or null when there is no heading or no text before it
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		// \b rather than '>' so that a heading tag carrying attributes is found as well.
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		$text = substr( $text, 0, $matches[0][1] );
		if ( $text === '' ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		// Compared against '' on purpose: an opening text of "0" is still text.
		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get opening text
	 * @return string|null
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the defaultsort property
	 * @return string|null
	 */
	public function getDefaultSort() {
		return $this->parserOutput->getProperty( 'defaultsort' );
	}
}