Enhanced RC: Optimization of the initial collapsing
[mediawiki.git] / includes / MagicWord.php
blobadb2ab77471941da3ab2bbcdbe21cb400377a90a
1 <?php
2 /**
3 * File for magic words.
5 * See docs/magicword.txt.
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 * http://www.gnu.org/copyleft/gpl.html
22 * @file
23 * @ingroup Parser
/**
 * This class encapsulates "magic words" such as "#redirect", __NOTOC__, etc.
 *
 * @par Usage:
 * @code
 *     if (MagicWord::get( 'redirect' )->match( $text ) ) {
 *       // some code
 *     }
 * @endcode
 *
 * Possible future improvements:
 *   * Simultaneous searching for a number of magic words
 *   * MagicWord::$mObjects in shared memory
 *
 * Please avoid reading the data out of one of these objects and then writing
 * special case code. If possible, add another match()-like function here.
 *
 * To add magic words in an extension, use $magicWords in a file listed in
 * $wgExtensionMessagesFiles[].
 *
 * @par Example:
 * @code
 * $magicWords = array();
 *
 * $magicWords['en'] = array(
 *   'magicwordkey' => array( 0, 'case_insensitive_magic_word' ),
 *   'magicwordkey2' => array( 1, 'CASE_sensitive_magic_word2' ),
 * );
 * @endcode
 *
 * For magic words which are also Parser variables, add a MagicWordwgVariableIDs
 * hook. Use string keys.
 *
 * @ingroup Parser
 */
class MagicWord {
	/**#@+
	 * @private
	 */
	public $mId, $mSynonyms, $mCaseSensitive;
	public $mRegex = '';
	public $mRegexStart = '';
	public $mBaseRegex = '';
	public $mVariableRegex = '';
	public $mVariableStartToEndRegex = '';
	public $mModified = false;
	public $mFound = false;

	public static $mVariableIDsInitialised = false;
	public static $mVariableIDs = array(
		'currentmonth',
		'currentmonth1',
		'currentmonthname',
		'currentmonthnamegen',
		'currentmonthabbrev',
		'currentday',
		'currentday2',
		'currentdayname',
		'currentyear',
		'currenttime',
		'currenthour',
		'localmonth',
		'localmonth1',
		'localmonthname',
		'localmonthnamegen',
		'localmonthabbrev',
		'localday',
		'localday2',
		'localdayname',
		'localyear',
		'localtime',
		'localhour',
		'numberofarticles',
		'numberoffiles',
		'numberofedits',
		'articlepath',
		'pageid',
		'sitename',
		'server',
		'servername',
		'scriptpath',
		'stylepath',
		'pagename',
		'pagenamee',
		'fullpagename',
		'fullpagenamee',
		'namespace',
		'namespacee',
		'namespacenumber',
		'currentweek',
		'currentdow',
		'localweek',
		'localdow',
		'revisionid',
		'revisionday',
		'revisionday2',
		'revisionmonth',
		'revisionmonth1',
		'revisionyear',
		'revisiontimestamp',
		'revisionuser',
		'subpagename',
		'subpagenamee',
		'talkspace',
		'talkspacee',
		'subjectspace',
		'subjectspacee',
		'talkpagename',
		'talkpagenamee',
		'subjectpagename',
		'subjectpagenamee',
		'numberofusers',
		'numberofactiveusers',
		'numberofpages',
		'currentversion',
		'rootpagename',
		'rootpagenamee',
		'basepagename',
		'basepagenamee',
		'currenttimestamp',
		'localtimestamp',
		'directionmark',
		'contentlanguage',
		'numberofadmins',
		'numberofviews',
	);

	/* Array of caching hints for ParserCache */
	public static $mCacheTTLs = array(
		'currentmonth' => 86400,
		'currentmonth1' => 86400,
		'currentmonthname' => 86400,
		'currentmonthnamegen' => 86400,
		'currentmonthabbrev' => 86400,
		'currentday' => 3600,
		'currentday2' => 3600,
		'currentdayname' => 3600,
		'currentyear' => 86400,
		'currenttime' => 3600,
		'currenthour' => 3600,
		'localmonth' => 86400,
		'localmonth1' => 86400,
		'localmonthname' => 86400,
		'localmonthnamegen' => 86400,
		'localmonthabbrev' => 86400,
		'localday' => 3600,
		'localday2' => 3600,
		'localdayname' => 3600,
		'localyear' => 86400,
		'localtime' => 3600,
		'localhour' => 3600,
		'numberofarticles' => 3600,
		'numberoffiles' => 3600,
		'numberofedits' => 3600,
		'currentweek' => 3600,
		'currentdow' => 3600,
		'localweek' => 3600,
		'localdow' => 3600,
		'numberofusers' => 3600,
		'numberofactiveusers' => 3600,
		'numberofpages' => 3600,
		'currentversion' => 86400,
		'currenttimestamp' => 3600,
		'localtimestamp' => 3600,
		'pagesinnamespace' => 3600,
		'numberofadmins' => 3600,
		'numberofviews' => 3600,
		'numberingroup' => 3600,
	);

	public static $mDoubleUnderscoreIDs = array(
		'notoc',
		'nogallery',
		'forcetoc',
		'toc',
		'noeditsection',
		'newsectionlink',
		'nonewsectionlink',
		'hiddencat',
		'index',
		'noindex',
		'staticredirect',
		'notitleconvert',
		'nocontentconvert',
	);

	public static $mSubstIDs = array(
		'subst',
		'safesubst',
	);

	public static $mObjects = array();
	public static $mDoubleUnderscoreArray = null;

	/**#@-*/

	/**
	 * @param $id mixed Magic word ID
	 * @param $syn string|array Synonym or list of synonyms; always stored as an array
	 * @param $cs bool Whether the synonyms are matched case-sensitively
	 */
	public function __construct( $id = 0, $syn = array(), $cs = false ) {
		$this->mId = $id;
		$this->mSynonyms = (array)$syn;
		$this->mCaseSensitive = $cs;
	}

	/**
	 * Factory: creates an object representing an ID
	 *
	 * Objects are cached per ID in self::$mObjects, so repeated calls return
	 * the same instance until clearCache() is called.
	 *
	 * @param $id
	 *
	 * @throws MWException Via load(), if the ID has no synonyms
	 * @return MagicWord
	 */
	public static function &get( $id ) {
		if ( !isset( self::$mObjects[$id] ) ) {
			$mw = new MagicWord();
			$mw->load( $id );
			self::$mObjects[$id] = $mw;
		}
		return self::$mObjects[$id];
	}

	/**
	 * Get an array of parser variable IDs
	 *
	 * The MagicWordwgVariableIDs hook is run once, on first call, with the
	 * static list passed by reference.
	 *
	 * @return array
	 */
	public static function getVariableIDs() {
		if ( !self::$mVariableIDsInitialised ) {
			# Get variable IDs
			wfRunHooks( 'MagicWordwgVariableIDs', array( &self::$mVariableIDs ) );
			self::$mVariableIDsInitialised = true;
		}
		return self::$mVariableIDs;
	}

	/**
	 * Get an array of parser substitution modifier IDs
	 * @return array
	 */
	public static function getSubstIDs() {
		return self::$mSubstIDs;
	}

	/**
	 * Allow external reads of TTL array
	 *
	 * @param $id string Magic word ID, used as a key of self::$mCacheTTLs
	 * @return int The caching hint registered for $id, or -1 if there is none
	 */
	public static function getCacheTTL( $id ) {
		if ( array_key_exists( $id, self::$mCacheTTLs ) ) {
			return self::$mCacheTTLs[$id];
		} else {
			return -1;
		}
	}

	/**
	 * Get a MagicWordArray of double-underscore entities
	 *
	 * The array is built once; the GetDoubleUnderscoreIDs hook is run before
	 * it is built, with the ID list passed by reference.
	 *
	 * @return MagicWordArray
	 */
	public static function getDoubleUnderscoreArray() {
		if ( is_null( self::$mDoubleUnderscoreArray ) ) {
			wfRunHooks( 'GetDoubleUnderscoreIDs', array( &self::$mDoubleUnderscoreIDs ) );
			self::$mDoubleUnderscoreArray = new MagicWordArray( self::$mDoubleUnderscoreIDs );
		}
		return self::$mDoubleUnderscoreArray;
	}

	/**
	 * Clear the self::$mObjects variable
	 * For use in parser tests
	 */
	public static function clearCache() {
		self::$mObjects = array();
	}

	/**
	 * Initialises this object with an ID
	 *
	 * Hands the object to $wgContLang->getMagic(), which is expected to fill
	 * in the synonyms (and presumably the case-sensitivity flag; confirm
	 * against Language::getMagic()).
	 *
	 * @param $id
	 * @throws MWException If no synonyms are set after the lookup
	 */
	public function load( $id ) {
		global $wgContLang;
		wfProfileIn( __METHOD__ );
		$this->mId = $id;
		$wgContLang->getMagic( $this );
		if ( !$this->mSynonyms ) {
			$this->mSynonyms = array( 'brionmademeputthishere' );
			wfProfileOut( __METHOD__ );
			throw new MWException( "Error: invalid magic word '$id'" );
		}
		wfProfileOut( __METHOD__ );
	}

	/**
	 * Preliminary initialisation
	 *
	 * Builds all cached regex strings from the synonym list.
	 * @private
	 */
	public function initRegex() {
		// Sort the synonyms by length, descending, so that the longest synonym
		// matches in precedence to the shortest
		$synonyms = $this->mSynonyms;
		usort( $synonyms, array( $this, 'compareStringLength' ) );

		$escSyn = array();
		foreach ( $synonyms as $synonym ) {
			// In case a magic word contains /, like that's going to happen;)
			$escSyn[] = preg_quote( $synonym, '/' );
		}
		$this->mBaseRegex = implode( '|', $escSyn );

		$case = $this->mCaseSensitive ? '' : 'iu';
		$this->mRegex = "/{$this->mBaseRegex}/{$case}";
		$this->mRegexStart = "/^(?:{$this->mBaseRegex})/{$case}";
		// preg_quote() turned the "$1" placeholder into "\$1"; swap it for a
		// lazy capture group.
		$this->mVariableRegex = str_replace( "\\$1", "(.*?)", $this->mRegex );
		$this->mVariableStartToEndRegex = str_replace( "\\$1", "(.*?)",
			"/^(?:{$this->mBaseRegex})$/{$case}" );
	}

	/**
	 * A comparison function that returns -1, 0 or 1 depending on whether the
	 * first string is longer, the same length or shorter than the second
	 * string.
	 *
	 * @param $s1 string
	 * @param $s2 string
	 *
	 * @return int
	 */
	public function compareStringLength( $s1, $s2 ) {
		$l1 = strlen( $s1 );
		$l2 = strlen( $s2 );
		if ( $l1 < $l2 ) {
			return 1;
		} elseif ( $l1 > $l2 ) {
			return -1;
		} else {
			return 0;
		}
	}

	/**
	 * Gets a regex representing matching the word
	 *
	 * @return string
	 */
	public function getRegex() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mRegex;
	}

	/**
	 * Gets the regexp case modifier to use, i.e. i or nothing, to be used if
	 * one is using MagicWord::getBaseRegex(), otherwise it'll be included in
	 * the complete expression
	 *
	 * @return string
	 */
	public function getRegexCase() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}

		return $this->mCaseSensitive ? '' : 'iu';
	}

	/**
	 * Gets a regex matching the word, if it is at the string start
	 *
	 * @return string
	 */
	public function getRegexStart() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mRegexStart;
	}

	/**
	 * regex without the slashes and what not
	 *
	 * @return string
	 */
	public function getBaseRegex() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mBaseRegex;
	}

	/**
	 * Returns true if the text contains the word
	 *
	 * @param $text string
	 *
	 * @return bool
	 */
	public function match( $text ) {
		return (bool)preg_match( $this->getRegex(), $text );
	}

	/**
	 * Returns true if the text starts with the word
	 *
	 * @param $text string
	 *
	 * @return bool
	 */
	public function matchStart( $text ) {
		return (bool)preg_match( $this->getRegexStart(), $text );
	}

	/**
	 * Returns NULL if there's no match, the value of $1 otherwise
	 * The return code is the matched string, if there's no variable
	 * part in the regex and the matched variable part ($1) if there
	 * is one.
	 *
	 * @param $text string
	 *
	 * @return string|null
	 */
	public function matchVariableStartToEnd( $text ) {
		$matches = array();
		$matchcount = preg_match( $this->getVariableStartToEndRegex(), $text, $matches );
		if ( $matchcount == 0 ) {
			return null;
		}

		# multiple matched parts (variable match); some will be empty because of
		# synonyms. The variable is the first non-empty capture group.
		# See also bug 6526
		#
		# Compare against '' explicitly: a truthiness test (array_filter() with
		# no callback) would also discard a legitimate captured "0".
		$count = count( $matches );
		for ( $i = 1; $i < $count; $i++ ) {
			if ( $matches[$i] !== '' ) {
				return $matches[$i];
			}
		}

		# No variable part (or it matched the empty string): whole match
		return $matches[0];
	}

	/**
	 * Returns true if the text matches the word, and alters the
	 * input string, removing all instances of the word
	 *
	 * @param $text string
	 *
	 * @return bool
	 */
	public function matchAndRemove( &$text ) {
		$this->mFound = false;
		$text = preg_replace_callback( $this->getRegex(), array( $this, 'pregRemoveAndRecord' ), $text );
		return $this->mFound;
	}

	/**
	 * Returns true if the text starts with the word, and alters the
	 * input string, removing that leading instance
	 *
	 * @param $text
	 * @return bool
	 */
	public function matchStartAndRemove( &$text ) {
		$this->mFound = false;
		$text = preg_replace_callback( $this->getRegexStart(), array( $this, 'pregRemoveAndRecord' ), $text );
		return $this->mFound;
	}

	/**
	 * Used in matchAndRemove()
	 *
	 * Callback for preg_replace_callback(): records that a match happened and
	 * replaces it with the empty string.
	 *
	 * @return string
	 */
	public function pregRemoveAndRecord() {
		$this->mFound = true;
		return '';
	}

	/**
	 * Replaces the word with something else
	 *
	 * @param $replacement
	 * @param $subject
	 * @param $limit int
	 *
	 * @return string
	 */
	public function replace( $replacement, $subject, $limit = -1 ) {
		$res = preg_replace( $this->getRegex(), StringUtils::escapeRegexReplacement( $replacement ), $subject, $limit );
		$this->mModified = $res !== $subject;
		return $res;
	}

	/**
	 * Variable handling: {{SUBST:xxx}} style words
	 * Calls back a function to determine what to replace xxx with
	 * Input word must contain $1
	 *
	 * @param $text string
	 * @param $callback
	 *
	 * @return string
	 */
	public function substituteCallback( $text, $callback ) {
		$res = preg_replace_callback( $this->getVariableRegex(), $callback, $text );
		$this->mModified = $res !== $text;
		return $res;
	}

	/**
	 * Matches the word, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableRegex() {
		if ( $this->mVariableRegex == '' ) {
			$this->initRegex();
		}
		return $this->mVariableRegex;
	}

	/**
	 * Matches the entire string, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableStartToEndRegex() {
		if ( $this->mVariableStartToEndRegex == '' ) {
			$this->initRegex();
		}
		return $this->mVariableStartToEndRegex;
	}

	/**
	 * Accesses the synonym list directly
	 *
	 * @param $i int
	 *
	 * @return string
	 */
	public function getSynonym( $i ) {
		return $this->mSynonyms[$i];
	}

	/**
	 * @return array
	 */
	public function getSynonyms() {
		return $this->mSynonyms;
	}

	/**
	 * Returns true if the last call to replace() or substituteCallback()
	 * returned a modified text, otherwise false.
	 *
	 * @return bool
	 */
	public function getWasModified() {
		return $this->mModified;
	}

	/**
	 * $magicarr is an associative array of (magic word ID => replacement)
	 * This method uses the php feature to do several replacements at the same time,
	 * thereby gaining some efficiency. The result is placed in the out variable
	 * $result. The return value is true if something was replaced.
	 *
	 * Note: replace() passes its replacement through
	 * StringUtils::escapeRegexReplacement(); this method does not, so the
	 * replacements here are handed to preg_replace() as-is.
	 *
	 * @todo Should this be static? It doesn't seem to be used at all
	 *
	 * @param $magicarr
	 * @param $subject
	 * @param $result
	 *
	 * @return bool
	 */
	public function replaceMultiple( $magicarr, $subject, &$result ) {
		$search = array();
		$replace = array();
		foreach ( $magicarr as $id => $replacement ) {
			$mw = MagicWord::get( $id );
			$search[] = $mw->getRegex();
			$replace[] = $replacement;
		}

		$result = preg_replace( $search, $replace, $subject );
		return $result !== $subject;
	}

	/**
	 * Adds all the synonyms of this MagicWord to an array, to allow quick
	 * lookup in a list of magic words
	 *
	 * Keys are the synonyms lower-cased with $wgContLang->lc().
	 *
	 * @param $array
	 * @param $value
	 */
	public function addToArray( &$array, $value ) {
		global $wgContLang;
		foreach ( $this->mSynonyms as $syn ) {
			$array[$wgContLang->lc( $syn )] = $value;
		}
	}

	/**
	 * @return bool
	 */
	public function isCaseSensitive() {
		return $this->mCaseSensitive;
	}

	/**
	 * @return int
	 */
	public function getId() {
		return $this->mId;
	}
}
/**
 * Class for handling an array of magic words
 * @ingroup Parser
 */
class MagicWordArray {
	public $names = array();
	public $hash;
	public $baseRegex, $regex;
	public $matches;

	/**
	 * @param $names array
	 */
	public function __construct( $names = array() ) {
		$this->names = $names;
	}

	/**
	 * Add a magic word by name
	 *
	 * Resets the cached hash and regexes so they are rebuilt on next use.
	 *
	 * @param $name string
	 */
	public function add( $name ) {
		$this->names[] = $name;
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Add a number of magic words by name
	 *
	 * Resets the cached hash and regexes so they are rebuilt on next use.
	 *
	 * @param $names array
	 */
	public function addArray( $names ) {
		$this->names = array_merge( $this->names, array_values( $names ) );
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * Index 0 maps lower-cased synonyms of case-insensitive words to their
	 * magic word name; index 1 maps synonyms of case-sensitive words, as
	 * written, to their name.
	 *
	 * @return array
	 */
	public function getHash() {
		if ( is_null( $this->hash ) ) {
			global $wgContLang;
			$this->hash = array( 0 => array(), 1 => array() );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $wgContLang->lc( $syn );
					}
					$this->hash[$case][$syn] = $name;
				}
			}
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * Each synonym becomes a named group "<synonym index>_<magic word name>";
	 * parseMatch() relies on that naming to recover the magic word name.
	 *
	 * @return array Two alternation strings: index 0 for case-insensitive
	 *  words, index 1 for case-sensitive words; '' where there are none
	 */
	public function getBaseRegex() {
		if ( is_null( $this->baseRegex ) ) {
			$this->baseRegex = array( 0 => '', 1 => '' );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $i => $syn ) {
					$group = "(?P<{$i}_{$name}>" . preg_quote( $syn, '/' ) . ')';
					if ( $this->baseRegex[$case] === '' ) {
						$this->baseRegex[$case] = $group;
					} else {
						$this->baseRegex[$case] .= '|' . $group;
					}
				}
			}
		}
		return $this->baseRegex;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 *
	 * @return array Index 0: case-insensitive regex, index 1: case-sensitive
	 *  regex; '' where there are no words of that kind
	 */
	public function getRegex() {
		if ( is_null( $this->regex ) ) {
			$base = $this->getBaseRegex();
			$this->regex = array( '', '' );
			if ( $base[0] !== '' ) {
				$this->regex[0] = "/{$base[0]}/iuS";
			}
			if ( $base[1] !== '' ) {
				$this->regex[1] = "/{$base[1]}/S";
			}
		}
		return $this->regex;
	}

	/**
	 * Get a regex for matching variables with parameters
	 *
	 * @return array Same layout as getRegex(), with each quoted "$1"
	 *  placeholder replaced by a lazy capture group
	 */
	public function getVariableRegex() {
		return str_replace( "\\$1", "(.*?)", $this->getRegex() );
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return array
	 */
	public function getRegexStart() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = "/^(?:{$base[0]})/iuS";
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = "/^(?:{$base[1]})/S";
		}
		return $newRegex;
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * @return array
	 */
	public function getVariableStartToEndRegex() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
		}
		return $newRegex;
	}

	/**
	 * @since 1.20
	 * @return array
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 * Returns array(magic word ID, parameter value)
	 * If there is no parameter value, that element will be false.
	 *
	 * The first non-empty named group identifies the magic word. In the match
	 * array that named entry is followed by its numeric duplicate, and the
	 * entry after that (if any) is taken as the parameter value.
	 *
	 * @param $m array
	 *
	 * @throws MWException
	 * @return array
	 */
	public function parseMatch( $m ) {
		reset( $m );
		// Walk with key()/current()/next() rather than each(), which was
		// removed in PHP 8. As with each(), the internal pointer has already
		// moved past the current entry when the loop body inspects it.
		while ( ( $key = key( $m ) ) !== null ) {
			$value = current( $m );
			next( $m );
			if ( $key === 0 || $value === '' ) {
				continue;
			}
			$parts = explode( '_', $key, 2 );
			if ( count( $parts ) != 2 ) {
				// This shouldn't happen
				// continue;
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			list( /* $synIndex */, $magicName ) = $parts;
			// The pointer is on the numeric duplicate of the named group;
			// one more step reaches the parameter capture, or returns false
			// at the end of the array.
			$paramValue = next( $m );
			return array( $magicName, $paramValue );
		}
		// This shouldn't happen either
		throw new MWException( __METHOD__ . ': parameter not found' );
	}

	/**
	 * Match some text, with parameter capture
	 * Returns an array with the magic word name in the first element and the
	 * parameter in the second element.
	 * Both elements are false if there was no match.
	 *
	 * @param $text string
	 *
	 * @return array
	 */
	public function matchVariableStartToEnd( $text ) {
		$regexes = $this->getVariableStartToEndRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex !== '' ) {
				$m = array();
				if ( preg_match( $regex, $text, $m ) ) {
					return $this->parseMatch( $m );
				}
			}
		}
		return array( false, false );
	}

	/**
	 * Match some text, without parameter capture
	 * Returns the magic word name, or false if there was no capture
	 *
	 * The case-sensitive table is checked first, then the case-insensitive
	 * table using the lower-cased text.
	 *
	 * @param $text string
	 *
	 * @return string|bool False on failure
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		global $wgContLang;
		$lc = $wgContLang->lc( $text );
		if ( isset( $hash[0][$lc] ) ) {
			return $hash[0][$lc];
		}
		return false;
	}

	/**
	 * Returns an associative array, ID => param value, for all items that match
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @param $text string
	 *
	 * @return array
	 */
	public function matchAndRemove( &$text ) {
		$found = array();
		$regexes = $this->getRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$matches = array();
			preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			foreach ( $matches as $m ) {
				list( $name, $param ) = $this->parseMatch( $m );
				$found[$name] = $param;
			}
			$text = preg_replace( $regex, '', $text );
		}
		return $found;
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 * Return false if no match found and $text is not modified.
	 * Does not match parameters.
	 *
	 * @param $text string
	 *
	 * @return int|bool False on failure
	 */
	public function matchStartAndRemove( &$text ) {
		$regexes = $this->getRegexStart();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$m = array();
			if ( preg_match( $regex, $text, $m ) ) {
				list( $id, ) = $this->parseMatch( $m );
				// Guard the whole-string case explicitly instead of relying
				// on what substr() returns for an offset at the string end.
				if ( strlen( $m[0] ) >= strlen( $text ) ) {
					$text = '';
				} else {
					$text = substr( $text, strlen( $m[0] ) );
				}
				return $id;
			}
		}
		return false;
	}
}