Mark ParserOptions suppressSectionEditLinks as safe to cache
[mediawiki.git] / includes / parser / MagicWordArray.php
blob1103259dbf2a695be8fd381ad4b1c7d3be92847a
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
18 * @file
21 namespace MediaWiki\Parser;
23 use LogicException;
24 use MediaWiki\Logger\LoggerFactory;
25 use MediaWiki\MediaWikiServices;
27 /**
28 * Class for handling an array of magic words
30 * See docs/magicword.md.
32 * @since 1.11
33 * @ingroup Parser
35 class MagicWordArray {
37 /** @var string[] */
38 public $names = [];
39 private MagicWordFactory $factory;
41 /** @var array<int,array<string,string>>|null */
42 private $hash;
44 /** @var string[]|null */
45 private $baseRegex;
47 /** @var string[]|null */
48 private $regex;
/**
 * Create an array of magic words from a list of magic word names.
 *
 * @param string[] $names Names of the magic words to start with
 * @param MagicWordFactory|null $factory Factory used to look up the magic words.
 *  When omitted, the factory is taken from the global MediaWikiServices instance.
 */
public function __construct( $names = [], ?MagicWordFactory $factory = null ) {
	$this->names = $names;
	// The parameter is declared explicitly nullable (implicit nullability through a
	// null default is deprecated in PHP 8.4), and only an absent factory triggers
	// the service container fallback.
	$this->factory = $factory ?? MediaWikiServices::getInstance()->getMagicWordFactory();
}
/**
 * Append one magic word to this array.
 *
 * All lazily built lookup structures are reset so that they are rebuilt,
 * including the new word, the next time they are requested.
 *
 * @param string $name Name of the magic word to add
 */
public function add( $name ): void {
	$this->names[] = $name;

	// Invalidate the memoized regexes and the synonym hash
	$this->regex = null;
	$this->baseRegex = null;
	$this->hash = null;
}
/**
 * Build (on first use) and return the two-level synonym lookup table.
 *
 * The outer key is 1 for case-sensitive magic words and 0 for all others.
 * The inner array maps each synonym to the name of its magic word; synonyms
 * of words that are not case-sensitive are stored after being passed through
 * the content language's lc().
 *
 * @return array<int,array<string,string>>
 */
public function getHash(): array {
	if ( $this->hash !== null ) {
		return $this->hash;
	}

	$this->hash = [ 0 => [], 1 => [] ];
	foreach ( $this->names as $name ) {
		$word = $this->factory->get( $name );
		$sensitive = (int)$word->isCaseSensitive();
		foreach ( $word->getSynonyms() as $synonym ) {
			$key = $sensitive
				? $synonym
				: $this->factory->getContentLanguage()->lc( $synonym );
			$this->hash[$sensitive][$key] = $name;
		}
	}

	return $this->hash;
}
/**
 * Get the base regex
 *
 * For every magic word in this array, each synonym is quoted for use in a
 * regex and the results are joined into one alternation per case-sensitivity
 * class. The result for the default arguments is cached.
 *
 * @internal For use in {@see Parser} only
 * @param bool $capture Set to false to suppress the capture groups,
 *  which can cause unexpected conflicts when this regexp is embedded in
 *  other regexps with similar constructs.
 * @param string $delimiter The delimiter which will be used for the
 *  eventual regexp.
 * @return array<int,string> Regex fragments without delimiters or modifiers:
 *  key 0 holds the words that are not case-sensitive (wrapped in "(?i:…)"),
 *  key 1 holds the case-sensitive words.
 */
public function getBaseRegex( bool $capture = true, string $delimiter = '/' ): array {
	// Only the variant built with the default arguments is cached
	$cacheable = $capture && $delimiter === '/';
	if ( $cacheable && $this->baseRegex !== null ) {
		return $this->baseRegex;
	}

	/** @var array<int,string[]> $alternatives */
	$alternatives = [ 0 => [], 1 => [] ];
	foreach ( $this->names as $name ) {
		$magic = $this->factory->get( $name );
		$case = $magic->isCaseSensitive() ? 1 : 0;
		foreach ( $magic->getSynonyms() as $i => $syn ) {
			$part = preg_quote( $syn, $delimiter );
			if ( $capture ) {
				// Group name must start with a non-digit in PCRE 8.34+
				$it = strtr( (string)$i, '0123456789', 'abcdefghij' );
				$part = '(?P<' . $it . '_' . $name . '>' . $part . ')';
			}
			$alternatives[$case][] = $part;
		}
	}

	// Assemble the result by value in a fresh array. Iterating by reference would
	// leave the last element reference-bound while the array is cached and returned.
	$regex = [];
	foreach ( $alternatives as $case => $parts ) {
		// With no synonyms at all, '(?!)' (an empty negative lookahead) yields a
		// pattern that can never match.
		$re = $parts ? implode( '|', $parts ) : '(?!)';
		$regex[$case] = $case ? $re : "(?i:{$re})";
	}

	if ( $cacheable ) {
		$this->baseRegex = $regex;
	}
	return $regex;
}
/**
 * Get an unanchored regex that does not match parameters
 *
 * The base regex fragments are wrapped in "/" delimiters with the "JS"
 * modifiers. The result is built once and then served from $this->regex.
 *
 * @return array<int,string>
 */
private function getRegex(): array {
	if ( $this->regex !== null ) {
		return $this->regex;
	}

	$this->regex = [];
	foreach ( $this->getBaseRegex( true, '/' ) as $case => $fragment ) {
		$this->regex[$case] = '/' . $fragment . '/JS';
	}
	// As a performance optimization, turn on unicode mode only for
	// case-insensitive matching.
	$this->regex[0] .= 'u';

	return $this->regex;
}
/**
 * Get a regex anchored to the start of the string that does not match parameters
 *
 * Unlike getRegex(), the result is rebuilt on every call.
 *
 * @return array<int,string>
 */
private function getRegexStart(): array {
	$anchored = [];
	foreach ( $this->getBaseRegex( true, '/' ) as $case => $fragment ) {
		$anchored[$case] = '/^(?:' . $fragment . ')/JS';
	}
	// As a performance optimization, turn on unicode mode only for
	// case-insensitive matching.
	$anchored[0] .= 'u';

	return $anchored;
}
/**
 * Get an anchored regex for matching variables with parameters
 *
 * Every quoted "$1" placeholder in the base regex is turned into a lazy
 * "(.*?)" capture group, and the whole pattern is anchored at both ends.
 *
 * @return array<int,string>
 */
private function getVariableStartToEndRegex(): array {
	$anchored = [];
	foreach ( $this->getBaseRegex( true, '/' ) as $case => $fragment ) {
		// The anchoring wrapper itself contains no '\$1', so replacing inside the
		// fragment gives the same result as replacing in the finished pattern.
		$withParameter = str_replace( '\$1', '(.*?)', $fragment );
		$anchored[$case] = '/^(?:' . $withParameter . ')$/JS';
	}
	// As a performance optimization, turn on unicode mode only for
	// case-insensitive matching.
	$anchored[0] .= 'u';

	return $anchored;
}
/**
 * Get the names of all magic words currently held by this array.
 *
 * @since 1.20
 * @return string[]
 */
public function getNames() {
	return $this->names;
}
/**
 * Parse a match array from preg_match
 *
 * Finds the first non-empty capture group other than the full match, takes the
 * magic word ID from the part of its group name after the first underscore,
 * and picks up the parameter value from the group after it, if any.
 *
 * @param array<string|int,string> $matches
 * @return array{0:string,1:string|false} Pair of (magic word ID, parameter value),
 *  where the latter is instead false if there is no parameter value.
 * @throws LogicException If the matching group has no underscore in its name,
 *  or if no result could be extracted from $matches.
 */
private function parseMatch( array $matches ): array {
	$foundName = null;
	foreach ( $matches as $key => $value ) {
		if ( $foundName !== null ) {
			// The previous iteration identified the named group, so the
			// structure we are looking at is [ …,
			//   'a_magicWordName' => 'matchedSynonym',
			//   n => 'matchedSynonym (again)',      <- current $key
			//   n + 1 => 'parameterValue',
			// … ]
			return [ $foundName, $matches[$key + 1] ?? false ];
		}

		// Skip the initial full match and any non-matching group
		if ( $key === 0 || $value === '' ) {
			continue;
		}

		$pieces = explode( '_', $key, 2 );
		if ( count( $pieces ) < 2 ) {
			throw new LogicException( 'Unexpected group name' );
		}
		$foundName = $pieces[1];
	}

	throw new LogicException( 'Unexpected $m array with no match' );
}
/**
 * Match some text, with parameter capture
 *
 * The text has to match one of the anchored variable regexes from start to
 * end; the regexes are tried in order and the first hit wins.
 *
 * @param string $text
 * @return (string|false)[] Magic word name in the first element and the parameter in the second
 *  element. Both elements are false if there was no match.
 */
public function matchVariableStartToEnd( $text ): array {
	foreach ( $this->getVariableStartToEndRegex() as $pattern ) {
		$captures = [];
		if ( preg_match( $pattern, $text, $captures ) === 1 ) {
			return $this->parseMatch( $captures );
		}
	}

	return [ false, false ];
}
/**
 * Match some text, without parameter capture
 *
 * The text is first looked up as-is among the case-sensitive synonyms. If that
 * fails, it is passed through the content language's lc() and looked up among
 * the remaining synonyms.
 *
 * @see MagicWord::matchStartToEnd
 * @param string $text
 * @return string|false The magic word name, or false if there was no capture
 */
public function matchStartToEnd( $text ) {
	$table = $this->getHash();

	$exact = $table[1][$text] ?? null;
	if ( $exact !== null ) {
		return $exact;
	}

	$folded = $this->factory->getContentLanguage()->lc( $text );
	return $table[0][$folded] ?? false;
}
/**
 * Return an associative array for all items that match.
 *
 * Cannot be used for magic words with parameters.
 * Removes the matched items from the input string (passed by reference)
 *
 * @see MagicWord::matchAndRemove
 * @param string &$text
 * @return array<string,false> Keyed by magic word ID
 * @throws LogicException If a PCRE call fails, except for a bad UTF-8 error
 *  from preg_match_all, in which case that regex is skipped.
 */
public function matchAndRemove( &$text ): array {
	$found = [];
	foreach ( $this->getRegex() as $regex ) {
		$matches = [];
		$count = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
		if ( $count === false ) {
			[ $error, $errorText ] = $this->logPregError( 'preg_match_all', $regex, $text );
			// T321234: Don't try to fix old revisions with broken UTF-8, just return as is
			if ( $error === PREG_BAD_UTF8_ERROR ) {
				continue;
			}
			throw new LogicException( "preg_match_all error $error: $errorText" );
		}

		if ( $count ) {
			foreach ( $matches as $m ) {
				[ $name, $param ] = $this->parseMatch( $m );
				$found[$name] = $param;
			}
		}

		$stripped = preg_replace( $regex, '', $text );
		if ( $stripped === null ) {
			[ $error, $errorText ] = $this->logPregError( 'preg_replace', $regex, $text );
			throw new LogicException( "preg_replace error $error: $errorText" );
		}
		$text = $stripped;
	}

	return $found;
}

/**
 * Log the failure of the most recent PCRE call as a warning on the 'parser' channel.
 *
 * @param string $function Name of the preg_* function that failed
 * @param string $regex Regex that was passed to the failed call
 * @param string $text Subject text that was passed to the failed call
 * @return array{0:int,1:string} Pair of (preg_last_error() code, preg_last_error_msg() text)
 */
private function logPregError( string $function, string $regex, $text ): array {
	$error = preg_last_error();
	$errorText = preg_last_error_msg();
	LoggerFactory::getInstance( 'parser' )->warning( $function . ' error: {code} {errorText}', [
		'code' => $error,
		'regex' => $regex,
		'text' => $text,
		'errorText' => $errorText
	] );

	return [ $error, $errorText ];
}
/**
 * Return the ID of the magic word at the start of $text, and remove
 * the prefix from $text.
 *
 * Does not match parameters.
 *
 * @see MagicWord::matchStartAndRemove
 * @param string &$text Unmodified if no match is found.
 * @return string|false False if no match is found.
 */
public function matchStartAndRemove( &$text ) {
	foreach ( $this->getRegexStart() as $pattern ) {
		if ( !preg_match( $pattern, $text, $captures ) ) {
			continue;
		}

		// Resolve the ID before touching $text, so that $text stays unmodified
		// if the match array cannot be parsed.
		[ $id ] = $this->parseMatch( $captures );

		$prefixLength = strlen( $captures[0] );
		// When the match covers the whole text, assign '' directly instead of
		// calling substr() with an offset at the end of the string.
		$text = $prefixLength >= strlen( $text ) ? '' : substr( $text, $prefixLength );

		return $id;
	}

	return false;
}
/**
 * Keeps the class reachable under its old global (un-namespaced) name.
 *
 * @deprecated class alias since 1.40
 */
class_alias( MagicWordArray::class, 'MagicWordArray' );