followup r91869: validate id chars for incoming prefs tabs in hash ([\w-]+ is suffici...
[mediawiki.git] / includes / MagicWord.php
blobd15793802b4d45118c88ef74b8569c2ffa5bb94d
1 <?php
2 /**
3 * File for magic words
5 * See docs/magicword.txt
7 * @file
8 * @ingroup Parser
9 */
/**
 * This class encapsulates "magic words" such as #redirect, __NOTOC__, etc.
 * Usage:
 *     if ( MagicWord::get( 'redirect' )->match( $text ) )
 *
 * Possible future improvements:
 *   * Simultaneous searching for a number of magic words
 *   * MagicWord::$mObjects in shared memory
 *
 * Please avoid reading the data out of one of these objects and then writing
 * special case code. If possible, add another match()-like function here.
 *
 * To add magic words in an extension, use the LanguageGetMagic hook. For
 * magic words which are also Parser variables, add a MagicWordwgVariableIDs
 * hook. Use string keys.
 *
 * @ingroup Parser
 */
class MagicWord {
	/**#@+
	 * @private
	 */
	/** Magic word ID, list of synonyms, and case-sensitivity flag */
	public $mId, $mSynonyms, $mCaseSensitive;
	/** Lazily built regexes; the empty string means "not built yet" (see initRegex()) */
	public $mRegex = '';
	public $mRegexStart = '';
	public $mBaseRegex = '';
	public $mVariableRegex = '';
	public $mVariableStartToEndRegex = '';
	/** Whether the last replace()/substituteCallback() call changed its input */
	public $mModified = false;
	/** Set by pregRemoveAndRecord() when a *AndRemove() call hits a match */
	public $mFound = false;

	public static $mVariableIDsInitialised = false;
	public static $mVariableIDs = array(
		'currentmonth',
		'currentmonth1',
		'currentmonthname',
		'currentmonthnamegen',
		'currentmonthabbrev',
		'currentday',
		'currentday2',
		'currentdayname',
		'currentyear',
		'currenttime',
		'currenthour',
		'localmonth',
		'localmonth1',
		'localmonthname',
		'localmonthnamegen',
		'localmonthabbrev',
		'localday',
		'localday2',
		'localdayname',
		'localyear',
		'localtime',
		'localhour',
		'numberofarticles',
		'numberoffiles',
		'numberofedits',
		'articlepath',
		'sitename',
		'server',
		'servername',
		'scriptpath',
		'stylepath',
		'pagename',
		'pagenamee',
		'fullpagename',
		'fullpagenamee',
		'namespace',
		'namespacee',
		'currentweek',
		'currentdow',
		'localweek',
		'localdow',
		'revisionid',
		'revisionday',
		'revisionday2',
		'revisionmonth',
		'revisionmonth1',
		'revisionyear',
		'revisiontimestamp',
		'revisionuser',
		'subpagename',
		'subpagenamee',
		'talkspace',
		'talkspacee',
		'subjectspace',
		'subjectspacee',
		'talkpagename',
		'talkpagenamee',
		'subjectpagename',
		'subjectpagenamee',
		'numberofusers',
		'numberofactiveusers',
		'numberofpages',
		'currentversion',
		'basepagename',
		'basepagenamee',
		'currenttimestamp',
		'localtimestamp',
		'directionmark',
		'contentlanguage',
		'numberofadmins',
		'numberofviews',
	);

	/* Array of caching hints for ParserCache */
	public static $mCacheTTLs = array(
		'currentmonth' => 86400,
		'currentmonth1' => 86400,
		'currentmonthname' => 86400,
		'currentmonthnamegen' => 86400,
		'currentmonthabbrev' => 86400,
		'currentday' => 3600,
		'currentday2' => 3600,
		'currentdayname' => 3600,
		'currentyear' => 86400,
		'currenttime' => 3600,
		'currenthour' => 3600,
		'localmonth' => 86400,
		'localmonth1' => 86400,
		'localmonthname' => 86400,
		'localmonthnamegen' => 86400,
		'localmonthabbrev' => 86400,
		'localday' => 3600,
		'localday2' => 3600,
		'localdayname' => 3600,
		'localyear' => 86400,
		'localtime' => 3600,
		'localhour' => 3600,
		'numberofarticles' => 3600,
		'numberoffiles' => 3600,
		'numberofedits' => 3600,
		'currentweek' => 3600,
		'currentdow' => 3600,
		'localweek' => 3600,
		'localdow' => 3600,
		'numberofusers' => 3600,
		'numberofactiveusers' => 3600,
		'numberofpages' => 3600,
		'currentversion' => 86400,
		'currenttimestamp' => 3600,
		'localtimestamp' => 3600,
		'pagesinnamespace' => 3600,
		'numberofadmins' => 3600,
		'numberofviews' => 3600,
		'numberingroup' => 3600,
	);

	public static $mDoubleUnderscoreIDs = array(
		'notoc',
		'nogallery',
		'forcetoc',
		'toc',
		'noeditsection',
		'newsectionlink',
		'nonewsectionlink',
		'hiddencat',
		'index',
		'noindex',
		'staticredirect',
		'notitleconvert',
		'nocontentconvert',
	);

	public static $mSubstIDs = array(
		'subst',
		'safesubst',
	);

	/** Cache of MagicWord objects created by get(), keyed by ID */
	public static $mObjects = array();
	/** Lazily created MagicWordArray, see getDoubleUnderscoreArray() */
	public static $mDoubleUnderscoreArray = null;

	/**#@-*/

	/**
	 * @param $id Magic word ID
	 * @param $syn string|array Synonym or list of synonyms (always stored as an array)
	 * @param $cs bool Whether matching is case sensitive
	 */
	public function __construct( $id = 0, $syn = array(), $cs = false ) {
		$this->mId = $id;
		$this->mSynonyms = (array)$syn;
		$this->mCaseSensitive = $cs;
	}

	/**
	 * Factory: creates an object representing an ID
	 *
	 * Objects are created once per ID and cached in self::$mObjects.
	 *
	 * @param $id
	 *
	 * @return MagicWord
	 */
	public static function &get( $id ) {
		if ( !isset( self::$mObjects[$id] ) ) {
			$mw = new MagicWord();
			$mw->load( $id );
			self::$mObjects[$id] = $mw;
		}
		return self::$mObjects[$id];
	}

	/**
	 * Get an array of parser variable IDs
	 *
	 * On the first call the 'MagicWordMagicWords' and 'MagicWordwgVariableIDs'
	 * hooks are run so that extensions can contribute to the list.
	 *
	 * @return array
	 */
	public static function getVariableIDs() {
		if ( !self::$mVariableIDsInitialised ) {
			# Deprecated constant definition hook, available for extensions that need it
			$magicWords = array();
			wfRunHooks( 'MagicWordMagicWords', array( &$magicWords ) );
			foreach ( $magicWords as $word ) {
				define( $word, $word );
			}

			# Get variable IDs
			wfRunHooks( 'MagicWordwgVariableIDs', array( &self::$mVariableIDs ) );
			self::$mVariableIDsInitialised = true;
		}
		return self::$mVariableIDs;
	}

	/**
	 * Get an array of parser substitution modifier IDs
	 *
	 * @return array
	 */
	public static function getSubstIDs() {
		return self::$mSubstIDs;
	}

	/**
	 * Allow external reads of TTL array
	 *
	 * @param $id string Magic word ID
	 *
	 * @return int The cache hint from self::$mCacheTTLs, or -1 if the ID has no entry
	 */
	public static function getCacheTTL( $id ) {
		if ( array_key_exists( $id, self::$mCacheTTLs ) ) {
			return self::$mCacheTTLs[$id];
		} else {
			return -1;
		}
	}

	/**
	 * Get a MagicWordArray of double-underscore entities
	 *
	 * @return MagicWordArray
	 */
	public static function getDoubleUnderscoreArray() {
		if ( is_null( self::$mDoubleUnderscoreArray ) ) {
			self::$mDoubleUnderscoreArray = new MagicWordArray( self::$mDoubleUnderscoreIDs );
		}
		return self::$mDoubleUnderscoreArray;
	}

	/**
	 * Clear the self::$mObjects variable
	 * For use in parser tests
	 */
	public static function clearCache() {
		self::$mObjects = array();
	}

	/**
	 * Initialises this object with an ID
	 *
	 * The content language object is asked to fill in this object through
	 * getMagic(). If no synonyms are set afterwards, a placeholder synonym is
	 * installed and the problem is logged instead of throwing.
	 *
	 * @param $id
	 */
	public function load( $id ) {
		global $wgContLang;
		wfProfileIn( __METHOD__ );
		$this->mId = $id;
		$wgContLang->getMagic( $this );
		if ( !$this->mSynonyms ) {
			$this->mSynonyms = array( 'dkjsagfjsgashfajsh' );
			#throw new MWException( "Error: invalid magic word '$id'" );
			wfDebugLog( 'exception', "Error: invalid magic word '$id'\n" );
		}
		wfProfileOut( __METHOD__ );
	}

	/**
	 * Preliminary initialisation
	 *
	 * Builds all cached regexes (mBaseRegex, mRegex, mRegexStart,
	 * mVariableRegex, mVariableStartToEndRegex) from the synonym list.
	 * @private
	 */
	public function initRegex() {
		// Sort the synonyms by length, descending, so that the longest synonym
		// matches in precedence to the shortest
		$synonyms = $this->mSynonyms;
		usort( $synonyms, array( $this, 'compareStringLength' ) );

		$escSyn = array();
		foreach ( $synonyms as $synonym ) {
			// In case a magic word contains /, like that's going to happen;)
			$escSyn[] = preg_quote( $synonym, '/' );
		}
		$this->mBaseRegex = implode( '|', $escSyn );

		$case = $this->mCaseSensitive ? '' : 'iu';
		$this->mRegex = "/{$this->mBaseRegex}/{$case}";
		$this->mRegexStart = "/^(?:{$this->mBaseRegex})/{$case}";
		// preg_quote() turned the "$1" placeholder into "\$1"; swap it for a
		// lazy capture group in the variable-matching regexes.
		$this->mVariableRegex = str_replace( "\\$1", "(.*?)", $this->mRegex );
		$this->mVariableStartToEndRegex = str_replace( "\\$1", "(.*?)",
			"/^(?:{$this->mBaseRegex})$/{$case}" );
	}

	/**
	 * A comparison function that returns -1, 0 or 1 depending on whether the
	 * first string is longer, the same length or shorter than the second
	 * string.
	 *
	 * @param $s1 string
	 * @param $s2 string
	 *
	 * @return int
	 */
	public function compareStringLength( $s1, $s2 ) {
		$l1 = strlen( $s1 );
		$l2 = strlen( $s2 );
		if ( $l1 < $l2 ) {
			return 1;
		} elseif ( $l1 > $l2 ) {
			return -1;
		} else {
			return 0;
		}
	}

	/**
	 * Gets a regex representing matching the word
	 *
	 * @return string
	 */
	public function getRegex() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mRegex;
	}

	/**
	 * Gets the regexp case modifier to use, i.e. i or nothing, to be used if
	 * one is using MagicWord::getBaseRegex(), otherwise it'll be included in
	 * the complete expression
	 *
	 * @return string
	 */
	public function getRegexCase() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}

		return $this->mCaseSensitive ? '' : 'iu';
	}

	/**
	 * Gets a regex matching the word, if it is at the string start
	 *
	 * @return string
	 */
	public function getRegexStart() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mRegexStart;
	}

	/**
	 * regex without the slashes and what not
	 *
	 * @return string
	 */
	public function getBaseRegex() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mBaseRegex;
	}

	/**
	 * Returns true if the text contains the word
	 *
	 * @param $text string
	 *
	 * @return bool
	 */
	public function match( $text ) {
		return (bool)preg_match( $this->getRegex(), $text );
	}

	/**
	 * Returns true if the text starts with the word
	 *
	 * @param $text string
	 *
	 * @return bool
	 */
	public function matchStart( $text ) {
		return (bool)preg_match( $this->getRegexStart(), $text );
	}

	/**
	 * Returns NULL if there's no match, the value of $1 otherwise
	 * The return code is the matched string, if there's no variable
	 * part in the regex and the matched variable part ($1) if there
	 * is one.
	 *
	 * @param $text string
	 *
	 * @return string|null
	 */
	public function matchVariableStartToEnd( $text ) {
		$matches = array();
		$matchcount = preg_match( $this->getVariableStartToEndRegex(), $text, $matches );
		if ( $matchcount == 0 ) {
			return null;
		} else {
			# multiple matched parts (variable match); some will be empty because of
			# synonyms. The variable will be the second non-empty one so remove any
			# blank elements and re-sort the indices.
			# See also bug 6526

			# Filter on string length rather than truthiness: a bare
			# array_filter() would also discard a captured value of "0".
			$matches = array_values( array_filter( $matches, 'strlen' ) );

			if ( count( $matches ) == 1 ) {
				return $matches[0];
			} else {
				return $matches[1];
			}
		}
	}

	/**
	 * Returns true if the text matches the word, and alters the
	 * input string, removing all instances of the word
	 *
	 * @param $text string
	 *
	 * @return bool
	 */
	public function matchAndRemove( &$text ) {
		$this->mFound = false;
		// Objects are passed by handle, so no reference to $this is needed.
		$text = preg_replace_callback( $this->getRegex(), array( $this, 'pregRemoveAndRecord' ), $text );
		return $this->mFound;
	}

	/**
	 * Returns true if the text starts with the word, and removes that
	 * leading occurrence from the input string
	 *
	 * @param $text
	 * @return bool
	 */
	public function matchStartAndRemove( &$text ) {
		$this->mFound = false;
		// Objects are passed by handle, so no reference to $this is needed.
		$text = preg_replace_callback( $this->getRegexStart(), array( $this, 'pregRemoveAndRecord' ), $text );
		return $this->mFound;
	}

	/**
	 * Used in matchAndRemove()
	 *
	 * Records that a match happened and replaces it with nothing.
	 *
	 * @return string
	 */
	public function pregRemoveAndRecord() {
		$this->mFound = true;
		return '';
	}

	/**
	 * Replaces the word with something else
	 *
	 * @param $replacement
	 * @param $subject
	 * @param $limit int
	 *
	 * @return string
	 */
	public function replace( $replacement, $subject, $limit = -1 ) {
		$res = preg_replace( $this->getRegex(), StringUtils::escapeRegexReplacement( $replacement ), $subject, $limit );
		$this->mModified = !( $res === $subject );
		return $res;
	}

	/**
	 * Variable handling: {{SUBST:xxx}} style words
	 * Calls back a function to determine what to replace xxx with
	 * Input word must contain $1
	 *
	 * @param $text string
	 * @param $callback
	 *
	 * @return string
	 */
	public function substituteCallback( $text, $callback ) {
		$res = preg_replace_callback( $this->getVariableRegex(), $callback, $text );
		$this->mModified = !( $res === $text );
		return $res;
	}

	/**
	 * Matches the word, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableRegex() {
		if ( $this->mVariableRegex == '' ) {
			$this->initRegex();
		}
		return $this->mVariableRegex;
	}

	/**
	 * Matches the entire string, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableStartToEndRegex() {
		if ( $this->mVariableStartToEndRegex == '' ) {
			$this->initRegex();
		}
		return $this->mVariableStartToEndRegex;
	}

	/**
	 * Accesses the synonym list directly
	 *
	 * @param $i int
	 *
	 * @return string
	 */
	public function getSynonym( $i ) {
		return $this->mSynonyms[$i];
	}

	/**
	 * @return array
	 */
	public function getSynonyms() {
		return $this->mSynonyms;
	}

	/**
	 * Returns true if the last call to replace() or substituteCallback()
	 * returned a modified text, otherwise false.
	 *
	 * @return bool
	 */
	public function getWasModified() {
		return $this->mModified;
	}

	/**
	 * $magicarr is an associative array of (magic word ID => replacement)
	 * This method uses the php feature to do several replacements at the same time,
	 * thereby gaining some efficiency. The result is placed in the out variable
	 * $result. The return value is true if something was replaced.
	 *
	 * Note that, unlike replace(), the replacement strings are handed to
	 * preg_replace() as they are, without StringUtils::escapeRegexReplacement().
	 *
	 * @todo Should this be static? It doesn't seem to be used at all
	 *
	 * @param $magicarr
	 * @param $subject
	 * @param $result
	 *
	 * @return bool
	 */
	public function replaceMultiple( $magicarr, $subject, &$result ) {
		$search = array();
		$replace = array();
		foreach ( $magicarr as $id => $replacement ) {
			$mw = MagicWord::get( $id );
			$search[] = $mw->getRegex();
			$replace[] = $replacement;
		}

		$result = preg_replace( $search, $replace, $subject );
		return !( $result === $subject );
	}

	/**
	 * Adds all the synonyms of this MagicWord to an array, to allow quick
	 * lookup in a list of magic words
	 *
	 * Each synonym is lower-cased with the content language before being
	 * used as a key.
	 *
	 * @param $array
	 * @param $value
	 */
	public function addToArray( &$array, $value ) {
		global $wgContLang;
		foreach ( $this->mSynonyms as $syn ) {
			$array[$wgContLang->lc( $syn )] = $value;
		}
	}

	/**
	 * @return bool
	 */
	public function isCaseSensitive() {
		return $this->mCaseSensitive;
	}

	/**
	 * @return string The magic word ID this object was constructed or loaded with
	 */
	public function getId() {
		return $this->mId;
	}
}
/**
 * Class for handling an array of magic words
 * @ingroup Parser
 */
class MagicWordArray {
	/** List of magic word IDs held by this array */
	public $names = array();
	/** Cached result of getHash(); null until built */
	public $hash;
	/** Cached results of getBaseRegex() and getRegex(); null until built */
	public $baseRegex, $regex;
	public $matches;

	/**
	 * @param $names array List of magic word IDs
	 */
	public function __construct( $names = array() ) {
		$this->names = $names;
	}

	/**
	 * Add a magic word by name
	 *
	 * Invalidates the cached hash and regexes.
	 *
	 * @param $name string
	 */
	public function add( $name ) {
		$this->names[] = $name;
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Add a number of magic words by name
	 *
	 * Invalidates the cached hash and regexes.
	 *
	 * @param $names array
	 */
	public function addArray( $names ) {
		$this->names = array_merge( $this->names, array_values( $names ) );
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * Index 0 maps lower-cased synonyms of case-insensitive words to their
	 * magic word ID, index 1 maps the synonyms of case-sensitive words.
	 *
	 * @return array
	 */
	public function getHash() {
		if ( is_null( $this->hash ) ) {
			global $wgContLang;
			$this->hash = array( 0 => array(), 1 => array() );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $wgContLang->lc( $syn );
					}
					$this->hash[$case][$syn] = $name;
				}
			}
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * Every synonym becomes one named group "<index>_<magic word ID>", where
	 * the synonym index is spelled with the letters a-j instead of digits.
	 * parseMatch() recovers the ID from the part after the first underscore.
	 *
	 * @return array Alternations without delimiters: index 0 for
	 *   case-insensitive words, index 1 for case-sensitive words
	 */
	public function getBaseRegex() {
		if ( is_null( $this->baseRegex ) ) {
			$this->baseRegex = array( 0 => '', 1 => '' );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $i => $syn ) {
					// Group names must start with a non-digit in PCRE 8.34+
					$it = strtr( $i, '0123456789', 'abcdefghij' );
					$group = "(?P<{$it}_{$name}>" . preg_quote( $syn, '/' ) . ')';
					if ( $this->baseRegex[$case] === '' ) {
						$this->baseRegex[$case] = $group;
					} else {
						$this->baseRegex[$case] .= '|' . $group;
					}
				}
			}
		}
		return $this->baseRegex;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 *
	 * @return array Index 0: case-insensitive regex, index 1: case-sensitive
	 *   regex; an entry is '' when there are no words of that kind
	 */
	public function getRegex() {
		if ( is_null( $this->regex ) ) {
			$base = $this->getBaseRegex();
			$this->regex = array( '', '' );
			if ( $base[0] !== '' ) {
				$this->regex[0] = "/{$base[0]}/iuS";
			}
			if ( $base[1] !== '' ) {
				$this->regex[1] = "/{$base[1]}/S";
			}
		}
		return $this->regex;
	}

	/**
	 * Get a regex for matching variables with parameters
	 *
	 * @return array Same layout as getRegex(), with the quoted "$1"
	 *   placeholder turned into a lazy capture group
	 */
	public function getVariableRegex() {
		return str_replace( "\\$1", "(.*?)", $this->getRegex() );
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return array Same layout as getRegex()
	 */
	public function getRegexStart() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = "/^(?:{$base[0]})/iuS";
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = "/^(?:{$base[1]})/S";
		}
		return $newRegex;
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * @return array Same layout as getRegex()
	 */
	public function getVariableStartToEndRegex() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
		}
		return $newRegex;
	}

	/**
	 * Parse a match array from preg_match
	 * Returns array(magic word ID, parameter value)
	 * If there is no parameter value, that element will be false.
	 *
	 * @param $m array
	 *
	 * @return array
	 */
	public function parseMatch( $m ) {
		// Walk the match by position instead of with each()/next(), which
		// are not available on current PHP versions.
		$keys = array_keys( $m );
		$values = array_values( $m );
		$count = count( $keys );
		for ( $pos = 0; $pos < $count; $pos++ ) {
			$key = $keys[$pos];
			if ( $key === 0 || $values[$pos] === '' ) {
				continue;
			}
			$parts = explode( '_', $key, 2 );
			if ( count( $parts ) != 2 ) {
				// This shouldn't happen
				// continue;
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			list( /* $synIndex */, $magicName ) = $parts;
			// A named group is followed by its numeric duplicate; the
			// parameter capture, if there is one, sits in the slot after that.
			$paramValue = ( $pos + 2 < $count ) ? $values[$pos + 2] : false;
			return array( $magicName, $paramValue );
		}
		// This shouldn't happen either
		throw new MWException( __METHOD__ . ': parameter not found' );
	}

	/**
	 * Match some text, with parameter capture
	 * Returns an array with the magic word name in the first element and the
	 * parameter in the second element.
	 * Both elements are false if there was no match.
	 *
	 * @param $text string
	 *
	 * @return array
	 */
	public function matchVariableStartToEnd( $text ) {
		$regexes = $this->getVariableStartToEndRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex !== '' ) {
				$m = false;
				if ( preg_match( $regex, $text, $m ) ) {
					return $this->parseMatch( $m );
				}
			}
		}
		return array( false, false );
	}

	/**
	 * Match some text, without parameter capture
	 * Returns the magic word name, or false if there was no capture
	 *
	 * The case-sensitive table is tried first with the text as given, then
	 * the case-insensitive table with the lower-cased text.
	 *
	 * @param $text string
	 *
	 * @return string|false
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		global $wgContLang;
		$lc = $wgContLang->lc( $text );
		if ( isset( $hash[0][$lc] ) ) {
			return $hash[0][$lc];
		}
		return false;
	}

	/**
	 * Returns an associative array, ID => param value, for all items that match
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @param $text string
	 *
	 * @return array
	 */
	public function matchAndRemove( &$text ) {
		$found = array();
		$regexes = $this->getRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			foreach ( $matches as $m ) {
				list( $name, $param ) = $this->parseMatch( $m );
				$found[$name] = $param;
			}
			$text = preg_replace( $regex, '', $text );
		}
		return $found;
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 * Return false if no match found and $text is not modified.
	 * Does not match parameters.
	 *
	 * @param $text string
	 *
	 * @return string|false
	 */
	public function matchStartAndRemove( &$text ) {
		$regexes = $this->getRegexStart();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			if ( preg_match( $regex, $text, $m ) ) {
				list( $id, ) = $this->parseMatch( $m );
				// Handle "match consumed everything" explicitly rather
				// than relying on substr() at the end of the string.
				if ( strlen( $m[0] ) >= strlen( $text ) ) {
					$text = '';
				} else {
					$text = substr( $text, strlen( $m[0] ) );
				}
				return $id;
			}
		}
		return false;
	}
}