mediawiki.userSuggest: Use formatversion=2 for API request
[mediawiki.git] / includes / MagicWord.php
blob80e60d2a34acdd3252e519fbab98327e441be617
1 <?php
2 /**
3 * File for magic words.
5 * See docs/magicword.txt.
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 * http://www.gnu.org/copyleft/gpl.html
22 * @file
23 * @ingroup Parser
26 use MediaWiki\Logger\LoggerFactory;
28 /**
29 * This class encapsulates "magic words" such as "#redirect", __NOTOC__, etc.
31 * @par Usage:
32 * @code
33 * if (MagicWord::get( 'redirect' )->match( $text ) ) {
34 * // some code
35 * }
36 * @endcode
38 * Possible future improvements:
39 * * Simultaneous searching for a number of magic words
40 * * MagicWord::$mObjects in shared memory
42 * Please avoid reading the data out of one of these objects and then writing
43 * special case code. If possible, add another match()-like function here.
45 * To add magic words in an extension, use $magicWords in a file listed in
46 * $wgExtensionMessagesFiles[].
48 * @par Example:
49 * @code
50 * $magicWords = array();
52 * $magicWords['en'] = array(
53 * 'magicwordkey' => array( 0, 'case_insensitive_magic_word' ),
54 * 'magicwordkey2' => array( 1, 'CASE_sensitive_magic_word2' ),
55 * );
56 * @endcode
58 * For magic words which are also Parser variables, add a MagicWordwgVariableIDs
59 * hook. Use string keys.
61 * @ingroup Parser
63 class MagicWord {
64 /**#@-*/
/** @var int|string Identifier of this magic word (constructor default is 0) */
public $mId;

/** @var array List of synonyms for this magic word */
public $mSynonyms;

/** @var bool Whether matching is case sensitive */
public $mCaseSensitive;

/** @var string Unanchored regex; empty until initRegex() has run */
private $mRegex = '';

/** @var string Regex anchored at the start of the string */
private $mRegexStart = '';

/** @var string Regex anchored at both ends of the string */
private $mRegexStartToEnd = '';

/** @var string Alternation of quoted synonyms, without delimiters or flags */
private $mBaseRegex = '';

/** @var string Unanchored regex with the quoted "$1" turned into a capture group */
private $mVariableRegex = '';

/** @var string Fully anchored regex with the quoted "$1" turned into a capture group */
private $mVariableStartToEndRegex = '';

/** @var bool Result of the last replace() / substituteCallback() comparison */
private $mModified = false;

/** @var bool Set by pregRemoveAndRecord() when a removal callback fires */
private $mFound = false;
/** @var bool True once the MagicWordwgVariableIDs hook has been run */
public static $mVariableIDsInitialised = false;

/** @var array Built-in parser variable IDs; handed to the hook by getVariableIDs() */
public static $mVariableIDs = array(
	'!',
	'currentmonth',
	'currentmonth1',
	'currentmonthname',
	'currentmonthnamegen',
	'currentmonthabbrev',
	'currentday',
	'currentday2',
	'currentdayname',
	'currentyear',
	'currenttime',
	'currenthour',
	'localmonth',
	'localmonth1',
	'localmonthname',
	'localmonthnamegen',
	'localmonthabbrev',
	'localday',
	'localday2',
	'localdayname',
	'localyear',
	'localtime',
	'localhour',
	'numberofarticles',
	'numberoffiles',
	'numberofedits',
	'articlepath',
	'pageid',
	'sitename',
	'server',
	'servername',
	'scriptpath',
	'stylepath',
	'pagename',
	'pagenamee',
	'fullpagename',
	'fullpagenamee',
	'namespace',
	'namespacee',
	'namespacenumber',
	'currentweek',
	'currentdow',
	'localweek',
	'localdow',
	'revisionid',
	'revisionday',
	'revisionday2',
	'revisionmonth',
	'revisionmonth1',
	'revisionyear',
	'revisiontimestamp',
	'revisionuser',
	'revisionsize',
	'subpagename',
	'subpagenamee',
	'talkspace',
	'talkspacee',
	'subjectspace',
	'subjectspacee',
	'talkpagename',
	'talkpagenamee',
	'subjectpagename',
	'subjectpagenamee',
	'numberofusers',
	'numberofactiveusers',
	'numberofpages',
	'currentversion',
	'rootpagename',
	'rootpagenamee',
	'basepagename',
	'basepagenamee',
	'currenttimestamp',
	'localtimestamp',
	'directionmark',
	'contentlanguage',
	'numberofadmins',
	'cascadingsources',
);
/**
 * Caching hints for ParserCache, keyed by magic word ID.
 * Values are read through getCacheTTL(); presumably seconds
 * (86400 = one day, 3600 = one hour) -- confirm against the consumer.
 *
 * @var array
 */
public static $mCacheTTLs = array(
	'currentmonth' => 86400,
	'currentmonth1' => 86400,
	'currentmonthname' => 86400,
	'currentmonthnamegen' => 86400,
	'currentmonthabbrev' => 86400,
	'currentday' => 3600,
	'currentday2' => 3600,
	'currentdayname' => 3600,
	'currentyear' => 86400,
	'currenttime' => 3600,
	'currenthour' => 3600,
	'localmonth' => 86400,
	'localmonth1' => 86400,
	'localmonthname' => 86400,
	'localmonthnamegen' => 86400,
	'localmonthabbrev' => 86400,
	'localday' => 3600,
	'localday2' => 3600,
	'localdayname' => 3600,
	'localyear' => 86400,
	'localtime' => 3600,
	'localhour' => 3600,
	'numberofarticles' => 3600,
	'numberoffiles' => 3600,
	'numberofedits' => 3600,
	'currentweek' => 3600,
	'currentdow' => 3600,
	'localweek' => 3600,
	'localdow' => 3600,
	'numberofusers' => 3600,
	'numberofactiveusers' => 3600,
	'numberofpages' => 3600,
	'currentversion' => 86400,
	'currenttimestamp' => 3600,
	'localtimestamp' => 3600,
	'pagesinnamespace' => 3600,
	'numberofadmins' => 3600,
	'numberingroup' => 3600,
);
/** @var array IDs of the double-underscore words; extended via the GetDoubleUnderscoreIDs hook */
public static $mDoubleUnderscoreIDs = array(
	'notoc',
	'nogallery',
	'forcetoc',
	'toc',
	'noeditsection',
	'newsectionlink',
	'nonewsectionlink',
	'hiddencat',
	'index',
	'noindex',
	'staticredirect',
	'notitleconvert',
	'nocontentconvert',
);
/** @var array IDs of the substitution modifiers returned by getSubstIDs() */
public static $mSubstIDs = array(
	'subst',
	'safesubst',
);

/** @var array Cache of MagicWord objects keyed by ID; filled by get(), emptied by clearCache() */
public static $mObjects = array();

/** @var MagicWordArray|null Lazily built by getDoubleUnderscoreArray() */
public static $mDoubleUnderscoreArray = null;
246 /**#@-*/
/**
 * Set up a magic word from its ID, synonyms and case-sensitivity flag.
 *
 * @param int|string $id Magic word ID
 * @param array|string $syn Synonym or list of synonyms; always stored as an array
 * @param bool $cs True if the word is case sensitive
 */
function __construct( $id = 0, $syn = array(), $cs = false ) {
	$this->mCaseSensitive = $cs;
	$this->mSynonyms = (array)$syn;
	$this->mId = $id;
}
/**
 * Factory: return the cached MagicWord for an ID, loading and caching it
 * on first request.
 *
 * @param int|string $id
 *
 * @return MagicWord
 */
static function &get( $id ) {
	if ( isset( self::$mObjects[$id] ) ) {
		return self::$mObjects[$id];
	}

	$word = new MagicWord();
	$word->load( $id );
	self::$mObjects[$id] = $word;

	return self::$mObjects[$id];
}
/**
 * Get the list of parser variable IDs.
 *
 * The MagicWordwgVariableIDs hook is run once, on the first call, and
 * receives the list by reference.
 *
 * @return array
 */
static function getVariableIDs() {
	if ( self::$mVariableIDsInitialised ) {
		return self::$mVariableIDs;
	}

	# Let hook handlers adjust the list, then remember that we did so
	Hooks::run( 'MagicWordwgVariableIDs', array( &self::$mVariableIDs ) );
	self::$mVariableIDsInitialised = true;

	return self::$mVariableIDs;
}
/**
 * Get the list of parser substitution modifier IDs.
 *
 * @return array
 */
static function getSubstIDs() {
	return self::$mSubstIDs;
}
/**
 * Look up the caching hint for a magic word ID.
 *
 * @param string $id Key into self::$mCacheTTLs
 * @return int The configured TTL, or -1 if the ID has no entry
 */
static function getCacheTTL( $id ) {
	return array_key_exists( $id, self::$mCacheTTLs )
		? self::$mCacheTTLs[$id]
		: -1;
}
/**
 * Get the MagicWordArray of double-underscore words.
 *
 * Built on first use, after the GetDoubleUnderscoreIDs hook has received
 * the ID list by reference.
 *
 * @return MagicWordArray
 */
static function getDoubleUnderscoreArray() {
	if ( self::$mDoubleUnderscoreArray !== null ) {
		return self::$mDoubleUnderscoreArray;
	}

	Hooks::run( 'GetDoubleUnderscoreIDs', array( &self::$mDoubleUnderscoreIDs ) );
	self::$mDoubleUnderscoreArray = new MagicWordArray( self::$mDoubleUnderscoreIDs );

	return self::$mDoubleUnderscoreArray;
}
/**
 * Drop every cached MagicWord object held in self::$mObjects.
 * Intended for parser tests.
 */
public static function clearCache() {
	self::$mObjects = array();
}
/**
 * Initialise this object for the given ID by asking the content language
 * to fill it in.
 *
 * @param int|string $id
 * @throws MWException If no synonyms were set for the ID
 */
function load( $id ) {
	global $wgContLang;

	$this->mId = $id;
	$wgContLang->getMagic( $this );

	if ( $this->mSynonyms ) {
		return;
	}

	// A placeholder synonym is stored before the failure is reported
	$this->mSynonyms = array( 'brionmademeputthishere' );
	throw new MWException( "Error: invalid magic word '$id'" );
}
/**
 * Build every cached regex variant from the synonym list.
 * @private
 */
function initRegex() {
	// Longest synonyms first, so that they win over shorter ones in the
	// alternation
	$sorted = $this->mSynonyms;
	usort( $sorted, array( $this, 'compareStringLength' ) );

	// Quote with "/" as the delimiter, in case a synonym contains one
	$quoted = array();
	foreach ( $sorted as $word ) {
		$quoted[] = preg_quote( $word, '/' );
	}
	$base = implode( '|', $quoted );
	$flags = $this->mCaseSensitive ? '' : 'iu';
	$anchored = "/^(?:{$base})$/{$flags}";

	$this->mBaseRegex = $base;
	$this->mRegex = "/{$base}/{$flags}";
	$this->mRegexStart = "/^(?:{$base})/{$flags}";
	$this->mRegexStartToEnd = $anchored;
	// preg_quote() turned "$1" into "\$1"; swap that for a lazy capture group
	$this->mVariableRegex = str_replace( "\\$1", "(.*?)", $this->mRegex );
	$this->mVariableStartToEndRegex = str_replace( "\\$1", "(.*?)", $anchored );
}
/**
 * Comparison callback ordering strings by byte length, longest first.
 *
 * @param string $s1
 * @param string $s2
 *
 * @return int -1 if $s1 is longer, 1 if it is shorter, 0 if both have the same length
 */
function compareStringLength( $s1, $s2 ) {
	$first = strlen( $s1 );
	$second = strlen( $s2 );

	if ( $first === $second ) {
		return 0;
	}

	return $first > $second ? -1 : 1;
}
/**
 * Get the unanchored regex matching the word, building it on first use.
 *
 * @return string
 */
function getRegex() {
	if ( $this->mRegex != '' ) {
		return $this->mRegex;
	}

	$this->initRegex();

	return $this->mRegex;
}
/**
 * Get the regex modifiers that go with MagicWord::getBaseRegex(): "iu"
 * for a case-insensitive word, nothing otherwise. The complete
 * expressions returned by the other getters already include them.
 *
 * @return string
 */
function getRegexCase() {
	// Same lazy initialisation as the other regex getters
	if ( $this->mRegex === '' ) {
		$this->initRegex();
	}

	if ( $this->mCaseSensitive ) {
		return '';
	}

	return 'iu';
}
/**
 * Get the regex matching the word at the start of a string, building it
 * on first use.
 *
 * @return string
 */
function getRegexStart() {
	if ( $this->mRegex != '' ) {
		return $this->mRegexStart;
	}

	$this->initRegex();

	return $this->mRegexStart;
}
/**
 * Get the regex matching the word against a whole string, building it on
 * first use.
 *
 * @return string
 * @since 1.23
 */
function getRegexStartToEnd() {
	if ( $this->mRegexStartToEnd != '' ) {
		return $this->mRegexStartToEnd;
	}

	$this->initRegex();

	return $this->mRegexStartToEnd;
}
/**
 * Get the bare alternation of quoted synonyms, without delimiters or
 * modifiers.
 *
 * @return string
 */
function getBaseRegex() {
	if ( $this->mRegex != '' ) {
		return $this->mBaseRegex;
	}

	$this->initRegex();

	return $this->mBaseRegex;
}
/**
 * Check whether the text contains the word anywhere.
 *
 * @param string $text
 *
 * @return bool
 */
function match( $text ) {
	$hit = preg_match( $this->getRegex(), $text );

	return (bool)$hit;
}
/**
 * Check whether the text starts with the word.
 *
 * @param string $text
 *
 * @return bool
 */
function matchStart( $text ) {
	$hit = preg_match( $this->getRegexStart(), $text );

	return (bool)$hit;
}
/**
 * Check whether the whole text is the word.
 *
 * @param string $text
 *
 * @return bool
 * @since 1.23
 */
function matchStartToEnd( $text ) {
	$hit = preg_match( $this->getRegexStartToEnd(), $text );

	return (bool)$hit;
}
/**
 * Match the whole text against the word, treating $1 as a wildcard.
 *
 * Returns null if there is no match. Otherwise the result is the matched
 * variable part ($1) if the regex has one and it is non-empty, or else the
 * whole matched string.
 *
 * @param string $text
 *
 * @return string|null
 */
function matchVariableStartToEnd( $text ) {
	$matches = array();
	if ( !preg_match( $this->getVariableStartToEndRegex(), $text, $matches ) ) {
		return null;
	}

	# Multiple matched parts (variable match); some will be empty because of
	# synonyms. The variable is the second non-empty one, so drop the empty
	# strings and renumber. Only '' is dropped: a bare array_filter() would
	# also discard a captured "0". See also bug 6526.
	$parts = array();
	foreach ( $matches as $part ) {
		if ( $part !== '' ) {
			$parts[] = $part;
		}
	}

	if ( isset( $parts[1] ) ) {
		return $parts[1];
	}

	// Only the full match is left. If even that was empty, keep returning
	// null as before, but without the undefined-offset notice.
	return isset( $parts[0] ) ? $parts[0] : null;
}
/**
 * Remove every instance of the word from the text and report whether
 * there was one.
 *
 * @param string $text Modified in place
 *
 * @return bool True if at least one instance was found
 */
function matchAndRemove( &$text ) {
	$this->mFound = false;
	$text = preg_replace_callback(
		$this->getRegex(),
		// A plain $this is enough for a method callback; &$this is
		// unnecessary and not reliable on PHP 7.1+
		array( $this, 'pregRemoveAndRecord' ),
		$text
	);

	return $this->mFound;
}
/**
 * Remove the word from the start of the text and report whether it was
 * there.
 *
 * @param string $text Modified in place
 * @return bool True if the text started with the word
 */
function matchStartAndRemove( &$text ) {
	$this->mFound = false;
	$text = preg_replace_callback(
		$this->getRegexStart(),
		// A plain $this is enough for a method callback; &$this is
		// unnecessary and not reliable on PHP 7.1+
		array( $this, 'pregRemoveAndRecord' ),
		$text
	);

	return $this->mFound;
}
/**
 * preg_replace_callback() handler for matchAndRemove() and
 * matchStartAndRemove(): notes that a match occurred and replaces it
 * with nothing.
 *
 * @return string Always the empty string
 */
function pregRemoveAndRecord() {
	$this->mFound = true;

	return '';
}
/**
 * Replace occurrences of the word with literal replacement text.
 *
 * Also records whether the result differs from the subject, for
 * getWasModified().
 *
 * @param string $replacement Escaped before it is used as a regex replacement
 * @param string $subject
 * @param int $limit Passed through to preg_replace()
 *
 * @return string
 */
function replace( $replacement, $subject, $limit = -1 ) {
	$pattern = $this->getRegex();
	$escaped = StringUtils::escapeRegexReplacement( $replacement );

	$res = preg_replace( $pattern, $escaped, $subject, $limit );
	$this->mModified = $res !== $subject;

	return $res;
}
/**
 * Variable handling: {{SUBST:xxx}} style words.
 * Each match of the variable regex is replaced with whatever the callback
 * returns. The word must contain $1.
 *
 * Also records whether the result differs from the input, for
 * getWasModified().
 *
 * @param string $text
 * @param callable $callback
 *
 * @return string
 */
function substituteCallback( $text, $callback ) {
	$pattern = $this->getVariableRegex();
	$res = preg_replace_callback( $pattern, $callback, $text );
	$this->mModified = $res !== $text;

	return $res;
}
/**
 * Get the unanchored regex in which $1 is a wildcard, building it on
 * first use.
 *
 * @return string
 */
function getVariableRegex() {
	if ( $this->mVariableRegex != '' ) {
		return $this->mVariableRegex;
	}

	$this->initRegex();

	return $this->mVariableRegex;
}
/**
 * Get the regex matching an entire string in which $1 is a wildcard,
 * building it on first use.
 *
 * @return string
 */
function getVariableStartToEndRegex() {
	if ( $this->mVariableStartToEndRegex != '' ) {
		return $this->mVariableStartToEndRegex;
	}

	$this->initRegex();

	return $this->mVariableStartToEndRegex;
}
/**
 * Read one entry of the synonym list directly. No bounds check is done.
 *
 * @param int $i Index into the synonym list
 *
 * @return string
 */
function getSynonym( $i ) {
	return $this->mSynonyms[$i];
}
/**
 * Get the full synonym list.
 *
 * @return array
 */
function getSynonyms() {
	return $this->mSynonyms;
}
/**
 * Tell whether the most recent replace() or substituteCallback() call
 * returned text different from its input.
 *
 * @return bool
 */
function getWasModified() {
	return $this->mModified;
}
/**
 * Replace several magic words in a single preg_replace() call.
 *
 * $magicarr is an associative array of (magic word ID => replacement).
 * The result is placed in the out variable $result.
 *
 * Replacements are treated as literal text and escaped the same way
 * replace() does, so "$1" or "\1" in a replacement is not read as a
 * backreference.
 *
 * @deprecated since 1.25, unused
 *
 * @param array $magicarr
 * @param string $subject
 * @param string $result Receives the replaced text
 *
 * @return bool True if something was replaced
 */
function replaceMultiple( $magicarr, $subject, &$result ) {
	wfDeprecated( __METHOD__, '1.25' );
	$search = array();
	$replace = array();
	foreach ( $magicarr as $id => $replacement ) {
		$mw = MagicWord::get( $id );
		$search[] = $mw->getRegex();
		$replace[] = StringUtils::escapeRegexReplacement( $replacement );
	}

	$result = preg_replace( $search, $replace, $subject );

	return $result !== $subject;
}
/**
 * Register every synonym of this word in a lookup array, keyed by the
 * synonym lower-cased through the content language.
 *
 * @param array $array Lookup table, modified in place
 * @param string $value Value stored under each synonym
 */
function addToArray( &$array, $value ) {
	global $wgContLang;

	foreach ( $this->mSynonyms as $synonym ) {
		$key = $wgContLang->lc( $synonym );
		$array[$key] = $value;
	}
}
/**
 * Tell whether this word is matched case-sensitively.
 *
 * @return bool
 */
function isCaseSensitive() {
	return $this->mCaseSensitive;
}
/**
 * Get the ID of this magic word.
 *
 * @return int|string
 */
function getId() {
	return $this->mId;
}
709 * Class for handling an array of magic words
710 * @ingroup Parser
712 class MagicWordArray {
/** @var array Magic word IDs held by this array */
public $names = array();

/** @var array|null Cache for getHash(); null until built or after a change */
private $hash;

/** @var array|null Cache for getBaseRegex(); null until built or after a change */
private $baseRegex;

/** @var array|null Cache for getRegex(); null until built or after a change */
private $regex;
/**
 * Create the array from a list of magic word IDs.
 *
 * @param array $names
 */
function __construct( $names = array() ) {
	$this->names = $names;
}
/**
 * Add one magic word by name and invalidate the cached hash and regexes.
 *
 * @param string $name
 */
public function add( $name ) {
	$this->names[] = $name;

	$this->regex = null;
	$this->baseRegex = null;
	$this->hash = null;
}
/**
 * Add several magic words by name and invalidate the cached hash and
 * regexes. The keys of $names are ignored.
 *
 * @param array $names
 */
public function addArray( $names ) {
	$extra = array_values( $names );
	$this->names = array_merge( $this->names, $extra );

	$this->regex = null;
	$this->baseRegex = null;
	$this->hash = null;
}
/**
 * Get the two-level lookup table for this array.
 *
 * Index 0 holds the case-insensitive words, keyed by the synonym
 * lower-cased through the content language; index 1 holds the
 * case-sensitive words, keyed by the synonym as written. Values are the
 * magic word names.
 *
 * @return array
 */
function getHash() {
	global $wgContLang;

	if ( $this->hash !== null ) {
		return $this->hash;
	}

	$this->hash = array( 0 => array(), 1 => array() );
	foreach ( $this->names as $name ) {
		$word = MagicWord::get( $name );
		$sensitive = intval( $word->isCaseSensitive() );
		foreach ( $word->getSynonyms() as $synonym ) {
			$key = $sensitive ? $synonym : $wgContLang->lc( $synonym );
			$this->hash[$sensitive][$key] = $name;
		}
	}

	return $this->hash;
}
/**
 * Get the base regexes: one alternation of named groups per case mode.
 *
 * Index 0 is the case-insensitive alternation, index 1 the case-sensitive
 * one. Each synonym becomes a group named "<index>_<name>", with the
 * digits of the synonym index mapped to the letters a-j.
 *
 * @return array
 */
function getBaseRegex() {
	if ( $this->baseRegex !== null ) {
		return $this->baseRegex;
	}

	$this->baseRegex = array( 0 => '', 1 => '' );
	foreach ( $this->names as $name ) {
		$word = MagicWord::get( $name );
		$sensitive = intval( $word->isCaseSensitive() );
		foreach ( $word->getSynonyms() as $index => $synonym ) {
			// Group name must start with a non-digit in PCRE 8.34+
			$prefix = strtr( $index, '0123456789', 'abcdefghij' );
			$group = "(?P<{$prefix}_{$name}>" . preg_quote( $synonym, '/' ) . ')';
			$separator = $this->baseRegex[$sensitive] === '' ? '' : '|';
			$this->baseRegex[$sensitive] .= $separator . $group;
		}
	}

	return $this->baseRegex;
}
/**
 * Get the unanchored regexes that do not match parameters.
 *
 * Index 0 is the case-insensitive regex, index 1 the case-sensitive one;
 * an entry is the empty string when no word of that kind exists.
 *
 * @return array
 */
function getRegex() {
	if ( $this->regex !== null ) {
		return $this->regex;
	}

	$base = $this->getBaseRegex();
	$this->regex = array( '', '' );
	foreach ( array( 'iuS', 'S' ) as $sensitive => $flags ) {
		if ( $base[$sensitive] !== '' ) {
			$this->regex[$sensitive] = "/{$base[$sensitive]}/{$flags}";
		}
	}

	return $this->regex;
}
/**
 * Get the regexes for matching variables with parameters.
 *
 * These are the regexes from getRegex() with each quoted "$1" replaced by
 * a lazy capture group. getRegex() returns an array, so str_replace()
 * returns an array with the same two entries.
 *
 * @return array
 */
function getVariableRegex() {
	return str_replace( "\\$1", "(.*?)", $this->getRegex() );
}
/**
 * Get the regexes anchored to the start of the string, not matching
 * parameters.
 *
 * Index 0 is the case-insensitive regex, index 1 the case-sensitive one;
 * an entry is the empty string when no word of that kind exists.
 *
 * @return array
 */
function getRegexStart() {
	$base = $this->getBaseRegex();
	$anchored = array( '', '' );

	foreach ( array( 'iuS', 'S' ) as $sensitive => $flags ) {
		if ( $base[$sensitive] !== '' ) {
			$anchored[$sensitive] = "/^(?:{$base[$sensitive]})/{$flags}";
		}
	}

	return $anchored;
}
/**
 * Get the fully anchored regexes for matching variables with parameters.
 *
 * Index 0 is the case-insensitive regex, index 1 the case-sensitive one;
 * an entry is the empty string when no word of that kind exists.
 *
 * @return array
 */
function getVariableStartToEndRegex() {
	$base = $this->getBaseRegex();
	$anchored = array( '', '' );

	foreach ( array( 'iuS', 'S' ) as $sensitive => $flags ) {
		if ( $base[$sensitive] !== '' ) {
			$pattern = "/^(?:{$base[$sensitive]})$/{$flags}";
			$anchored[$sensitive] = str_replace( "\\$1", "(.*?)", $pattern );
		}
	}

	return $anchored;
}
/**
 * Get the magic word IDs held by this array.
 *
 * @since 1.20
 * @return array
 */
public function getNames() {
	return $this->names;
}
/**
 * Parse a match array from preg_match.
 *
 * Returns array( magic word ID, parameter value ). The parameter value is
 * the element two positions after the first non-empty named group, or
 * false if the array ends before that.
 *
 * @param array $m
 *
 * @throws MWException If a non-empty group has no "<index>_<name>" key, or
 *  if no non-empty group is found at all
 * @return array
 */
function parseMatch( $m ) {
	reset( $m );
	// each() was removed in PHP 8.0; walk the internal pointer with
	// key() / current() / next() instead, which leaves the pointer in the
	// same place each() did.
	while ( ( $key = key( $m ) ) !== null ) {
		$value = current( $m );
		next( $m );
		if ( $key === 0 || $value === '' ) {
			continue;
		}
		$parts = explode( '_', $key, 2 );
		if ( count( $parts ) != 2 ) {
			// This shouldn't happen
			throw new MWException( __METHOD__ . ': bad parameter name' );
		}
		list( /* $synIndex */, $magicName ) = $parts;
		// The pointer now sits on the numeric duplicate of the named
		// group; the element after it is the parameter capture, if any.
		$paramValue = next( $m );
		return array( $magicName, $paramValue );
	}
	// This shouldn't happen either
	throw new MWException( __METHOD__ . ': parameter not found' );
}
/**
 * Match the whole text against the words, capturing a parameter.
 *
 * Returns an array with the magic word name in the first element and the
 * parameter in the second. Both elements are false if nothing matched.
 *
 * @param string $text
 *
 * @return array
 */
public function matchVariableStartToEnd( $text ) {
	foreach ( $this->getVariableStartToEndRegex() as $regex ) {
		if ( $regex === '' ) {
			continue;
		}

		$m = array();
		if ( preg_match( $regex, $text, $m ) ) {
			return $this->parseMatch( $m );
		}
	}

	return array( false, false );
}
/**
 * Match the whole text against the words, without parameter capture.
 *
 * The case-sensitive table is tried first with the text as given, then
 * the case-insensitive table with the text lower-cased through the
 * content language.
 *
 * @param string $text
 *
 * @return string|bool The magic word name, or false if there was no match
 */
public function matchStartToEnd( $text ) {
	global $wgContLang;

	$hash = $this->getHash();
	if ( isset( $hash[1][$text] ) ) {
		return $hash[1][$text];
	}

	$lowered = $wgContLang->lc( $text );

	return isset( $hash[0][$lowered] ) ? $hash[0][$lowered] : false;
}
/**
 * Find every word that occurs in the text and remove the matched items
 * from the input string (passed by reference).
 *
 * Returns an associative array, ID => param value, for all items that
 * match. A PCRE failure is logged as a warning on the 'parser' channel;
 * if the removal itself fails, the text is left as it was instead of
 * being overwritten with null.
 *
 * @param string $text Modified in place
 *
 * @return array
 */
public function matchAndRemove( &$text ) {
	$found = array();
	$regexes = $this->getRegex();
	foreach ( $regexes as $regex ) {
		if ( $regex === '' ) {
			continue;
		}
		$matches = array();
		$res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
		if ( $res === false ) {
			LoggerFactory::getInstance( 'parser' )->warning( 'preg_match_all returned false', array(
				'code' => preg_last_error(),
				'regex' => $regex,
				'text' => $text,
			) );
		} elseif ( $res ) {
			foreach ( $matches as $m ) {
				list( $name, $param ) = $this->parseMatch( $m );
				$found[$name] = $param;
			}
		}
		$res = preg_replace( $regex, '', $text );
		if ( $res === null ) {
			LoggerFactory::getInstance( 'parser' )->warning( 'preg_replace returned null', array(
				'code' => preg_last_error(),
				'regex' => $regex,
				'text' => $text,
			) );
		} else {
			// Only take the result of a successful replacement, so a PCRE
			// error does not wipe out the caller's text
			$text = $res;
		}
	}

	return $found;
}
/**
 * Return the ID of the magic word at the start of $text, and remove that
 * prefix from $text.
 * If nothing matches, false is returned and $text is not modified.
 * Does not match parameters.
 *
 * @param string $text Modified in place on a match
 *
 * @return int|string|bool The ID from parseMatch(), or false on failure
 */
public function matchStartAndRemove( &$text ) {
	foreach ( $this->getRegexStart() as $regex ) {
		if ( $regex === '' ) {
			continue;
		}
		if ( !preg_match( $regex, $text, $m ) ) {
			continue;
		}

		list( $id, ) = $this->parseMatch( $m );

		// Set '' explicitly when the match consumes everything, rather
		// than calling substr() at the end of the string
		$consumed = strlen( $m[0] );
		$text = $consumed >= strlen( $text ) ? '' : substr( $text, $consumed );

		return $id;
	}

	return false;
}