Update git submodules
[mediawiki.git] / includes / parser / MagicWordArray.php
blobeb5bd218f97a0c54d23f1be03957e924415945b0
1 <?php
/**
 * See docs/magicword.md.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Parser
 */
25 namespace MediaWiki\Parser;
27 use Exception;
28 use MediaWiki\Logger\LoggerFactory;
29 use MediaWiki\MediaWikiServices;
30 use MWException;
/**
 * Class for handling an array of magic words
 *
 * Holds a list of magic word names and lazily derives lookup structures
 * from them: a synonym hash table and regular expressions. Each derived
 * structure is a two-element array where index 0 covers the
 * case-insensitive magic words and index 1 the case-sensitive ones.
 *
 * @ingroup Parser
 */
class MagicWordArray {
	/** @var string[] */
	public $names = [];

	/** @var MagicWordFactory */
	private $factory;

	/** @var array|null Cache for getHash(); null until first built */
	private $hash;

	/** @var string[]|null Cache for getBaseRegex() with default arguments */
	private $baseRegex;

	/** @var string[]|null Cache for getRegex() */
	private $regex;

	/**
	 * @param string[] $names
	 * @param MagicWordFactory|null $factory When null, the factory is taken
	 *   from MediaWikiServices.
	 */
	public function __construct( $names = [], ?MagicWordFactory $factory = null ) {
		$this->names = $names;
		$this->factory = $factory ?? MediaWikiServices::getInstance()->getMagicWordFactory();
	}

	/**
	 * Add a magic word by name
	 *
	 * @param string $name
	 */
	public function add( $name ) {
		$this->names[] = $name;
		// Everything derived from the name list is now stale
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * The first dimension is the case sensitivity (0 or 1), the second maps
	 * each synonym to its magic word name. Case-insensitive synonyms are
	 * stored lowercased using the content language.
	 *
	 * @return array
	 */
	public function getHash() {
		if ( $this->hash === null ) {
			$this->hash = [ 0 => [], 1 => [] ];
			foreach ( $this->names as $name ) {
				$magic = $this->factory->get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $this->factory->getContentLanguage()->lc( $syn );
					}
					$this->hash[$case][$syn] = $name;
				}
			}
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * @param bool $capture Set to false to suppress the capture groups,
	 *  which can cause unexpected conflicts when this regexp is embedded in
	 *  other regexps with similar constructs.
	 * @param string $delimiter The delimiter which will be used for the
	 *  eventual regexp.
	 * @return string[] Index 0 is the case-insensitive alternation, index 1
	 *  the case-sensitive one. Neither has delimiters or modifiers.
	 * @throws MWException If two synonyms would produce the same group name
	 * @internal
	 */
	public function getBaseRegex( bool $capture = true, string $delimiter = '/' ): array {
		// Only the default-argument variant is cached
		if ( $capture && $delimiter === '/' && $this->baseRegex !== null ) {
			return $this->baseRegex;
		}
		$regex = [ 0 => [], 1 => [] ];
		$allGroups = [];
		foreach ( $this->names as $name ) {
			$magic = $this->factory->get( $name );
			$case = $magic->isCaseSensitive() ? 1 : 0;
			foreach ( $magic->getSynonyms() as $i => $syn ) {
				if ( $capture ) {
					// Group name must start with a non-digit in PCRE 8.34+
					$it = strtr( (string)$i, '0123456789', 'abcdefghij' );
					$groupName = $it . '_' . $name;
					$group = '(?P<' . $groupName . '>' . preg_quote( $syn, $delimiter ) . ')';
					// look for same group names to avoid same named subpatterns in the regex
					if ( isset( $allGroups[$groupName] ) ) {
						throw new MWException(
							__METHOD__ . ': duplicate internal name in magic word array: ' . $name
						);
					}
					$allGroups[$groupName] = true;
					$regex[$case][] = $group;
				} else {
					$regex[$case][] = preg_quote( $syn, $delimiter );
				}
			}
		}
		'@phan-var array<int,string[]> $regex';
		foreach ( $regex as $case => &$re ) {
			// '(?!)' never matches, so an empty list yields a valid regex
			$re = count( $re ) ? implode( '|', $re ) : '(?!)';
			if ( !$case ) {
				$re = "(?i:{$re})";
			}
		}
		// Break the reference so later use of $re cannot alter $regex
		unset( $re );
		'@phan-var array<int,string> $regex';

		if ( $capture && $delimiter === '/' ) {
			$this->baseRegex = $regex;
		}
		return $regex;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 *
	 * @return string[]
	 */
	private function getRegex(): array {
		if ( $this->regex === null ) {
			$this->regex = [];
			$base = $this->getBaseRegex( true, '/' );
			foreach ( $base as $case => $re ) {
				$this->regex[$case] = "/{$re}/S";
			}
			// As a performance optimization, turn on unicode mode only for
			// case-insensitive matching.
			$this->regex[0] .= 'u';
		}
		return $this->regex;
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return string[]
	 */
	private function getRegexStart(): array {
		$newRegex = [];
		$base = $this->getBaseRegex( true, '/' );
		foreach ( $base as $case => $re ) {
			$newRegex[$case] = "/^(?:{$re})/S";
		}
		// As a performance optimization, turn on unicode mode only for
		// case-insensitive matching.
		$newRegex[0] .= 'u';
		return $newRegex;
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * @return string[]
	 */
	private function getVariableStartToEndRegex(): array {
		$newRegex = [];
		$base = $this->getBaseRegex( true, '/' );
		foreach ( $base as $case => $re ) {
			// A quoted "$1" placeholder in a synonym becomes a lazy capture group
			$newRegex[$case] = str_replace( "\\$1", "(.*?)", "/^(?:{$re})$/S" );
		}
		// As a performance optimization, turn on unicode mode only for
		// case-insensitive matching.
		$newRegex[0] .= 'u';
		return $newRegex;
	}

	/**
	 * @since 1.20
	 * @return string[]
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 * Returns array(magic word ID, parameter value)
	 * If there is no parameter value, that element will be false.
	 *
	 * @param array $m
	 *
	 * @throws MWException
	 * @return array
	 */
	private function parseMatch( array $m ): array {
		reset( $m );
		while ( ( $key = key( $m ) ) !== null ) {
			$value = current( $m );
			// Advance now so the cursor sits on the entry after $key
			next( $m );
			if ( $key === 0 || $value === '' ) {
				continue;
			}
			// Group names have the form "<synonym index>_<magic word name>"
			$parts = explode( '_', $key, 2 );
			if ( count( $parts ) != 2 ) {
				// This shouldn't happen
				// continue;
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			[ /* $synIndex */, $magicName ] = $parts;
			// Advance once more and take that entry as the parameter;
			// next() returns false when there are no more entries
			$paramValue = next( $m );
			return [ $magicName, $paramValue ];
		}
		// This shouldn't happen either
		throw new MWException( __METHOD__ . ': parameter not found' );
	}

	/**
	 * Match some text, with parameter capture
	 * Returns an array with the magic word name in the first element and the
	 * parameter in the second element.
	 * Both elements are false if there was no match.
	 *
	 * @param string $text
	 *
	 * @return array
	 */
	public function matchVariableStartToEnd( $text ) {
		$regexes = $this->getVariableStartToEndRegex();
		foreach ( $regexes as $regex ) {
			$m = [];
			if ( preg_match( $regex, $text, $m ) ) {
				return $this->parseMatch( $m );
			}
		}
		return [ false, false ];
	}

	/**
	 * Match some text, without parameter capture
	 * Returns the magic word name, or false if there was no capture
	 *
	 * @param string $text
	 *
	 * @return string|false False on failure
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		// Exact (case-sensitive) synonyms are tried first
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		$lc = $this->factory->getContentLanguage()->lc( $text );
		return $hash[0][$lc] ?? false;
	}

	/**
	 * Returns an associative array, ID => param value, for all items that match
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @param string &$text
	 *
	 * @throws Exception On a PCRE error other than bad UTF-8 during matching
	 * @return array
	 */
	public function matchAndRemove( &$text ) {
		$found = [];
		$regexes = $this->getRegex();
		foreach ( $regexes as $regex ) {
			$matches = [];
			$res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			if ( $res === false ) {
				[ $error, $errorText ] = $this->logPregError( 'preg_match_all', $regex, $text );
				// T321234: Don't try to fix old revisions with broken UTF-8, just return as is
				if ( $error === PREG_BAD_UTF8_ERROR ) {
					continue;
				}
				throw new Exception( "preg_match_all error $error: $errorText" );
			} elseif ( $res ) {
				foreach ( $matches as $m ) {
					[ $name, $param ] = $this->parseMatch( $m );
					$found[$name] = $param;
				}
			}
			$res = preg_replace( $regex, '', $text );
			if ( $res === null ) {
				[ $error, $errorText ] = $this->logPregError( 'preg_replace', $regex, $text );
				throw new Exception( "preg_replace error $error: $errorText" );
			}
			$text = $res;
		}
		return $found;
	}

	/**
	 * Log the most recent PCRE error as a warning on the 'parser' channel
	 *
	 * Must be called directly after the failing preg_* call, before any
	 * other PCRE function resets the error state.
	 *
	 * @param string $function Name of the preg_* function that failed
	 * @param string $regex The regex that was being applied
	 * @param string $text The subject text
	 * @return array [ int error code, string error message ]
	 */
	private function logPregError( $function, $regex, $text ): array {
		$error = preg_last_error();
		// TODO: Remove function_exists when we require PHP8
		$errorText = function_exists( 'preg_last_error_msg' ) ? preg_last_error_msg() : '';
		LoggerFactory::getInstance( 'parser' )->warning( $function . ' error: {code} {errorText}', [
			'code' => $error,
			'regex' => $regex,
			'text' => $text,
			'errorText' => $errorText
		] );
		return [ $error, $errorText ];
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 * Return false if no match found and $text is not modified.
	 * Does not match parameters.
	 *
	 * @param string &$text
	 *
	 * @return string|false False on failure
	 */
	public function matchStartAndRemove( &$text ) {
		$regexes = $this->getRegexStart();
		foreach ( $regexes as $regex ) {
			$m = [];
			if ( preg_match( $regex, $text, $m ) ) {
				[ $id, ] = $this->parseMatch( $m );
				// Handle the whole-string match explicitly rather than
				// relying on what substr() returns at the end of the string
				if ( strlen( $m[0] ) >= strlen( $text ) ) {
					$text = '';
				} else {
					$text = substr( $text, strlen( $m[0] ) );
				}
				return $id;
			}
		}
		return false;
	}
}
/**
 * Register the un-namespaced name 'MagicWordArray' as an alias of this class.
 * @deprecated since 1.40
 */
class_alias( MagicWordArray::class, 'MagicWordArray' );