Remove "related" searches
[mediawiki.git] / includes / MagicWord.php
blob4d17298b23488e8147416322d6e3e3eab1a211fc
1 <?php
/**
 * File for magic words.
 *
 * See docs/magicword.txt.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Parser
 */
/**
 * This class encapsulates "magic words" such as "#redirect", __NOTOC__, etc.
 *
 * @par Usage:
 * @code
 *     if ( MagicWord::get( 'redirect' )->match( $text ) ) {
 *       // some code
 *     }
 * @endcode
 *
 * Possible future improvements:
 *   * Simultaneous searching for a number of magic words
 *   * MagicWord::$mObjects in shared memory
 *
 * Please avoid reading the data out of one of these objects and then writing
 * special case code. If possible, add another match()-like function here.
 *
 * To add magic words in an extension, use $magicWords in a file listed in
 * $wgExtensionMessagesFiles[].
 *
 * @par Example:
 * @code
 * $magicWords = array();
 *
 * $magicWords['en'] = array(
 *   'magicwordkey' => array( 0, 'case_insensitive_magic_word' ),
 *   'magicwordkey2' => array( 1, 'CASE_sensitive_magic_word2' ),
 * );
 * @endcode
 *
 * For magic words which are also Parser variables, add a MagicWordwgVariableIDs
 * hook. Use string keys.
 *
 * @ingroup Parser
 */
class MagicWord {
	/**#@-*/

	/** @var string|int Magic word ID such as 'redirect' (0 until one is assigned) */
	public $mId;

	/** @var array List of synonym strings for this magic word */
	public $mSynonyms;

	/** @var bool Whether matching is case-sensitive */
	public $mCaseSensitive;

	/** @var string Unanchored regex; empty until initRegex() has run */
	private $mRegex = '';

	/** @var string Regex anchored at the start of the subject */
	private $mRegexStart = '';

	/** @var string Regex anchored at both ends of the subject */
	private $mRegexStartToEnd = '';

	/** @var string Alternation of the quoted synonyms, without delimiters */
	private $mBaseRegex = '';

	/** @var string Unanchored regex with "$1" turned into a capture group */
	private $mVariableRegex = '';

	/** @var string Fully anchored regex with "$1" turned into a capture group */
	private $mVariableStartToEndRegex = '';

	/** @var bool Result of the last replace()/substituteCallback() call */
	private $mModified = false;

	/** @var bool Set by pregRemoveAndRecord() when a removal callback fires */
	private $mFound = false;

	public static $mVariableIDsInitialised = false;
	public static $mVariableIDs = array(
		'!',
		'currentmonth',
		'currentmonth1',
		'currentmonthname',
		'currentmonthnamegen',
		'currentmonthabbrev',
		'currentday',
		'currentday2',
		'currentdayname',
		'currentyear',
		'currenttime',
		'currenthour',
		'localmonth',
		'localmonth1',
		'localmonthname',
		'localmonthnamegen',
		'localmonthabbrev',
		'localday',
		'localday2',
		'localdayname',
		'localyear',
		'localtime',
		'localhour',
		'numberofarticles',
		'numberoffiles',
		'numberofedits',
		'articlepath',
		'pageid',
		'sitename',
		'server',
		'servername',
		'scriptpath',
		'stylepath',
		'pagename',
		'pagenamee',
		'fullpagename',
		'fullpagenamee',
		'namespace',
		'namespacee',
		'namespacenumber',
		'currentweek',
		'currentdow',
		'localweek',
		'localdow',
		'revisionid',
		'revisionday',
		'revisionday2',
		'revisionmonth',
		'revisionmonth1',
		'revisionyear',
		'revisiontimestamp',
		'revisionuser',
		'revisionsize',
		'subpagename',
		'subpagenamee',
		'talkspace',
		'talkspacee',
		'subjectspace',
		'subjectspacee',
		'talkpagename',
		'talkpagenamee',
		'subjectpagename',
		'subjectpagenamee',
		'numberofusers',
		'numberofactiveusers',
		'numberofpages',
		'currentversion',
		'rootpagename',
		'rootpagenamee',
		'basepagename',
		'basepagenamee',
		'currenttimestamp',
		'localtimestamp',
		'directionmark',
		'contentlanguage',
		'numberofadmins',
		'numberofviews',
		'cascadingsources',
	);

	/* Array of caching hints for ParserCache */
	public static $mCacheTTLs = array(
		'currentmonth' => 86400,
		'currentmonth1' => 86400,
		'currentmonthname' => 86400,
		'currentmonthnamegen' => 86400,
		'currentmonthabbrev' => 86400,
		'currentday' => 3600,
		'currentday2' => 3600,
		'currentdayname' => 3600,
		'currentyear' => 86400,
		'currenttime' => 3600,
		'currenthour' => 3600,
		'localmonth' => 86400,
		'localmonth1' => 86400,
		'localmonthname' => 86400,
		'localmonthnamegen' => 86400,
		'localmonthabbrev' => 86400,
		'localday' => 3600,
		'localday2' => 3600,
		'localdayname' => 3600,
		'localyear' => 86400,
		'localtime' => 3600,
		'localhour' => 3600,
		'numberofarticles' => 3600,
		'numberoffiles' => 3600,
		'numberofedits' => 3600,
		'currentweek' => 3600,
		'currentdow' => 3600,
		'localweek' => 3600,
		'localdow' => 3600,
		'numberofusers' => 3600,
		'numberofactiveusers' => 3600,
		'numberofpages' => 3600,
		'currentversion' => 86400,
		'currenttimestamp' => 3600,
		'localtimestamp' => 3600,
		'pagesinnamespace' => 3600,
		'numberofadmins' => 3600,
		'numberofviews' => 3600,
		'numberingroup' => 3600,
	);

	public static $mDoubleUnderscoreIDs = array(
		'notoc',
		'nogallery',
		'forcetoc',
		'toc',
		'noeditsection',
		'newsectionlink',
		'nonewsectionlink',
		'hiddencat',
		'index',
		'noindex',
		'staticredirect',
		'notitleconvert',
		'nocontentconvert',
	);

	public static $mSubstIDs = array(
		'subst',
		'safesubst',
	);

	public static $mObjects = array();
	public static $mDoubleUnderscoreArray = null;

	/**#@-*/

	/**
	 * @param string|int $id Magic word ID
	 * @param array|string $syn Synonym or list of synonyms; cast to an array
	 * @param bool $cs Whether the word is case-sensitive
	 */
	public function __construct( $id = 0, $syn = array(), $cs = false ) {
		$this->mId = $id;
		$this->mSynonyms = (array)$syn;
		$this->mCaseSensitive = $cs;
	}

	/**
	 * Factory: creates an object representing an ID
	 *
	 * Instances are cached in self::$mObjects, so repeated calls with the same
	 * ID return the same object.
	 *
	 * @param string $id
	 *
	 * @return MagicWord
	 */
	public static function &get( $id ) {
		if ( !isset( self::$mObjects[$id] ) ) {
			$mw = new MagicWord();
			$mw->load( $id );
			self::$mObjects[$id] = $mw;
		}
		return self::$mObjects[$id];
	}

	/**
	 * Get an array of parser variable IDs
	 *
	 * The MagicWordwgVariableIDs hook is run once, on first use, and receives
	 * the ID list by reference.
	 *
	 * @return array
	 */
	public static function getVariableIDs() {
		if ( !self::$mVariableIDsInitialised ) {
			# Get variable IDs
			wfRunHooks( 'MagicWordwgVariableIDs', array( &self::$mVariableIDs ) );
			self::$mVariableIDsInitialised = true;
		}
		return self::$mVariableIDs;
	}

	/**
	 * Get an array of parser substitution modifier IDs
	 * @return array
	 */
	public static function getSubstIDs() {
		return self::$mSubstIDs;
	}

	/**
	 * Allow external reads of TTL array
	 *
	 * @param string $id
	 * @return int The TTL registered in self::$mCacheTTLs, or -1 if there is none
	 */
	public static function getCacheTTL( $id ) {
		if ( array_key_exists( $id, self::$mCacheTTLs ) ) {
			return self::$mCacheTTLs[$id];
		}
		return -1;
	}

	/**
	 * Get a MagicWordArray of double-underscore entities
	 *
	 * The array is built once; the GetDoubleUnderscoreIDs hook receives the ID
	 * list by reference before it is wrapped.
	 *
	 * @return MagicWordArray
	 */
	public static function getDoubleUnderscoreArray() {
		if ( self::$mDoubleUnderscoreArray === null ) {
			wfRunHooks( 'GetDoubleUnderscoreIDs', array( &self::$mDoubleUnderscoreIDs ) );
			self::$mDoubleUnderscoreArray = new MagicWordArray( self::$mDoubleUnderscoreIDs );
		}
		return self::$mDoubleUnderscoreArray;
	}

	/**
	 * Clear the self::$mObjects variable
	 * For use in parser tests
	 */
	public static function clearCache() {
		self::$mObjects = array();
	}

	/**
	 * Initialises this object with an ID
	 *
	 * Passes this object to $wgContLang->getMagic(), which is presumably
	 * responsible for filling in the synonyms and case flag (its body is not
	 * part of this file).
	 *
	 * @param string $id
	 * @throws MWException If no synonyms were set for $id
	 */
	public function load( $id ) {
		global $wgContLang;
		wfProfileIn( __METHOD__ );
		$this->mId = $id;
		$wgContLang->getMagic( $this );
		if ( !$this->mSynonyms ) {
			// Leave a placeholder synonym behind before reporting the error
			$this->mSynonyms = array( 'brionmademeputthishere' );
			wfProfileOut( __METHOD__ );
			throw new MWException( "Error: invalid magic word '$id'" );
		}
		wfProfileOut( __METHOD__ );
	}

	/**
	 * Preliminary initialisation
	 *
	 * Builds every cached regex of this object from the synonym list.
	 * @private
	 */
	public function initRegex() {
		// Sort the synonyms by length, descending, so that the longest synonym
		// matches in precedence to the shortest
		$synonyms = $this->mSynonyms;
		usort( $synonyms, array( $this, 'compareStringLength' ) );

		$escSyn = array();
		foreach ( $synonyms as $synonym ) {
			// In case a magic word contains /, like that's going to happen;)
			$escSyn[] = preg_quote( $synonym, '/' );
		}
		$this->mBaseRegex = implode( '|', $escSyn );

		$case = $this->mCaseSensitive ? '' : 'iu';
		$this->mRegex = "/{$this->mBaseRegex}/{$case}";
		$this->mRegexStart = "/^(?:{$this->mBaseRegex})/{$case}";
		$this->mRegexStartToEnd = "/^(?:{$this->mBaseRegex})$/{$case}";
		// preg_quote() turned a literal "$1" into "\$1"; swap it for a
		// non-greedy capture group to obtain the variable forms.
		$this->mVariableRegex = str_replace( "\\$1", "(.*?)", $this->mRegex );
		$this->mVariableStartToEndRegex = str_replace( "\\$1", "(.*?)",
			"/^(?:{$this->mBaseRegex})$/{$case}" );
	}

	/**
	 * A comparison function that returns -1, 0 or 1 depending on whether the
	 * first string is longer, the same length or shorter than the second
	 * string.
	 *
	 * @param string $s1
	 * @param string $s2
	 *
	 * @return int
	 */
	public function compareStringLength( $s1, $s2 ) {
		$l1 = strlen( $s1 );
		$l2 = strlen( $s2 );
		if ( $l1 < $l2 ) {
			return 1;
		} elseif ( $l1 > $l2 ) {
			return -1;
		}
		return 0;
	}

	/**
	 * Gets a regex representing matching the word
	 *
	 * @return string
	 */
	public function getRegex() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mRegex;
	}

	/**
	 * Gets the regexp case modifier to use, i.e. i or nothing, to be used if
	 * one is using MagicWord::getBaseRegex(), otherwise it'll be included in
	 * the complete expression
	 *
	 * @return string
	 */
	public function getRegexCase() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}

		return $this->mCaseSensitive ? '' : 'iu';
	}

	/**
	 * Gets a regex matching the word, if it is at the string start
	 *
	 * @return string
	 */
	public function getRegexStart() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mRegexStart;
	}

	/**
	 * Gets a regex matching the word from start to end of a string
	 *
	 * @return string
	 * @since 1.23
	 */
	public function getRegexStartToEnd() {
		if ( $this->mRegexStartToEnd == '' ) {
			$this->initRegex();
		}
		return $this->mRegexStartToEnd;
	}

	/**
	 * regex without the slashes and what not
	 *
	 * @return string
	 */
	public function getBaseRegex() {
		if ( $this->mRegex == '' ) {
			$this->initRegex();
		}
		return $this->mBaseRegex;
	}

	/**
	 * Returns true if the text contains the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function match( $text ) {
		return (bool)preg_match( $this->getRegex(), $text );
	}

	/**
	 * Returns true if the text starts with the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function matchStart( $text ) {
		return (bool)preg_match( $this->getRegexStart(), $text );
	}

	/**
	 * Returns true if the text matched the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 * @since 1.23
	 */
	public function matchStartToEnd( $text ) {
		return (bool)preg_match( $this->getRegexStartToEnd(), $text );
	}

	/**
	 * Returns NULL if there's no match, the value of $1 otherwise
	 * The return code is the matched string, if there's no variable
	 * part in the regex and the matched variable part ($1) if there
	 * is one.
	 *
	 * @param string $text
	 *
	 * @return string|null
	 */
	public function matchVariableStartToEnd( $text ) {
		$matches = array();
		$matchcount = preg_match( $this->getVariableStartToEndRegex(), $text, $matches );
		if ( $matchcount == 0 ) {
			return null;
		}

		# multiple matched parts (variable match); some will be empty because of
		# synonyms. The variable will be the second non-empty one so remove any
		# blank elements and re-sort the indices.
		# See also bug 6526
		#
		# Only empty strings are dropped. array_filter() without a callback
		# would also discard a captured "0", making "0px" come back as "0px"
		# instead of "0".
		$matches = array_values( array_filter( $matches, function ( $part ) {
			return $part !== '';
		} ) );

		if ( count( $matches ) == 1 ) {
			return $matches[0];
		}
		return $matches[1];
	}

	/**
	 * Returns true if the text matches the word, and alters the
	 * input string, removing all instances of the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function matchAndRemove( &$text ) {
		$this->mFound = false;
		$text = preg_replace_callback(
			$this->getRegex(),
			// Objects are handles already; no reference to $this is needed
			array( $this, 'pregRemoveAndRecord' ),
			$text
		);

		return $this->mFound;
	}

	/**
	 * Like matchAndRemove(), but uses the regex anchored at the start of
	 * the text.
	 *
	 * @param string $text
	 * @return bool
	 */
	public function matchStartAndRemove( &$text ) {
		$this->mFound = false;
		$text = preg_replace_callback(
			$this->getRegexStart(),
			// Objects are handles already; no reference to $this is needed
			array( $this, 'pregRemoveAndRecord' ),
			$text
		);

		return $this->mFound;
	}

	/**
	 * Used in matchAndRemove()
	 *
	 * Records that a match happened and replaces it with nothing.
	 *
	 * @return string
	 */
	public function pregRemoveAndRecord() {
		$this->mFound = true;
		return '';
	}

	/**
	 * Replaces the word with something else
	 *
	 * @param string $replacement
	 * @param string $subject
	 * @param int $limit
	 *
	 * @return string
	 */
	public function replace( $replacement, $subject, $limit = -1 ) {
		$res = preg_replace(
			$this->getRegex(),
			StringUtils::escapeRegexReplacement( $replacement ),
			$subject,
			$limit
		);
		$this->mModified = $res !== $subject;
		return $res;
	}

	/**
	 * Variable handling: {{SUBST:xxx}} style words
	 * Calls back a function to determine what to replace xxx with
	 * Input word must contain $1
	 *
	 * @param string $text
	 * @param callable $callback
	 *
	 * @return string
	 */
	public function substituteCallback( $text, $callback ) {
		$res = preg_replace_callback( $this->getVariableRegex(), $callback, $text );
		$this->mModified = $res !== $text;
		return $res;
	}

	/**
	 * Matches the word, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableRegex() {
		if ( $this->mVariableRegex == '' ) {
			$this->initRegex();
		}
		return $this->mVariableRegex;
	}

	/**
	 * Matches the entire string, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableStartToEndRegex() {
		if ( $this->mVariableStartToEndRegex == '' ) {
			$this->initRegex();
		}
		return $this->mVariableStartToEndRegex;
	}

	/**
	 * Accesses the synonym list directly
	 *
	 * @param int $i
	 *
	 * @return string
	 */
	public function getSynonym( $i ) {
		return $this->mSynonyms[$i];
	}

	/**
	 * @return array
	 */
	public function getSynonyms() {
		return $this->mSynonyms;
	}

	/**
	 * Returns true if the last call to replace() or substituteCallback()
	 * returned a modified text, otherwise false.
	 *
	 * @return bool
	 */
	public function getWasModified() {
		return $this->mModified;
	}

	/**
	 * $magicarr is an associative array of (magic word ID => replacement)
	 * This method uses the php feature to do several replacements at the same time,
	 * thereby gaining some efficiency. The result is placed in the out variable
	 * $result. The return value is true if something was replaced.
	 *
	 * Unlike replace(), the replacement strings are handed to preg_replace()
	 * as they are, without StringUtils::escapeRegexReplacement().
	 * @todo Should this be static? It doesn't seem to be used at all
	 *
	 * @param array $magicarr
	 * @param string $subject
	 * @param string $result
	 *
	 * @return bool
	 */
	public function replaceMultiple( $magicarr, $subject, &$result ) {
		$search = array();
		$replace = array();
		foreach ( $magicarr as $id => $replacement ) {
			$mw = MagicWord::get( $id );
			$search[] = $mw->getRegex();
			$replace[] = $replacement;
		}

		$result = preg_replace( $search, $replace, $subject );
		return $result !== $subject;
	}

	/**
	 * Adds all the synonyms of this MagicWord to an array, to allow quick
	 * lookup in a list of magic words
	 *
	 * Each synonym is lower-cased with $wgContLang->lc() and used as a key.
	 *
	 * @param array $array
	 * @param string $value
	 */
	public function addToArray( &$array, $value ) {
		global $wgContLang;
		foreach ( $this->mSynonyms as $syn ) {
			$array[$wgContLang->lc( $syn )] = $value;
		}
	}

	/**
	 * @return bool
	 */
	public function isCaseSensitive() {
		return $this->mCaseSensitive;
	}

	/**
	 * @return string|int
	 */
	public function getId() {
		return $this->mId;
	}
}
/**
 * Class for handling an array of magic words
 * @ingroup Parser
 */
class MagicWordArray {
	/** @var array Magic word IDs held by this array */
	public $names = array();

	/** @var array|null Cached result of getHash(); null until built */
	private $hash;

	/** @var array|null Cached result of getBaseRegex(); null until built */
	private $baseRegex;

	/** @var array|null Cached result of getRegex(); null until built */
	private $regex;

	/** @todo Unused? */
	private $matches;

	/**
	 * @param array $names
	 */
	public function __construct( $names = array() ) {
		$this->names = $names;
	}

	/**
	 * Add a magic word by name
	 *
	 * Resets the cached hash and regexes.
	 *
	 * @param string $name
	 */
	public function add( $name ) {
		$this->names[] = $name;
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Add a number of magic words by name
	 *
	 * Resets the cached hash and regexes.
	 *
	 * @param array $names
	 */
	public function addArray( $names ) {
		$this->names = array_merge( $this->names, array_values( $names ) );
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * Index 0 maps lower-cased synonyms of case-insensitive words to their
	 * magic word name; index 1 maps synonyms of case-sensitive words as-is.
	 *
	 * @return array
	 */
	public function getHash() {
		if ( $this->hash === null ) {
			global $wgContLang;
			$this->hash = array( 0 => array(), 1 => array() );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $wgContLang->lc( $syn );
					}
					$this->hash[$case][$syn] = $name;
				}
			}
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * Every synonym becomes a named group "<index>_<name>", where the digits
	 * of the synonym index are mapped to the letters a-j.
	 *
	 * @return array Index 0: case-insensitive alternation, index 1: case-sensitive
	 */
	public function getBaseRegex() {
		if ( $this->baseRegex === null ) {
			$this->baseRegex = array( 0 => '', 1 => '' );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $i => $syn ) {
					// Group name must start with a non-digit in PCRE 8.34+
					$it = strtr( $i, '0123456789', 'abcdefghij' );
					$group = "(?P<{$it}_{$name}>" . preg_quote( $syn, '/' ) . ')';
					if ( $this->baseRegex[$case] === '' ) {
						$this->baseRegex[$case] = $group;
					} else {
						$this->baseRegex[$case] .= '|' . $group;
					}
				}
			}
		}
		return $this->baseRegex;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 * @return array Index 0: case-insensitive regex, index 1: case-sensitive
	 *  regex; an entry is '' when there are no words of that kind
	 */
	public function getRegex() {
		if ( $this->regex === null ) {
			$base = $this->getBaseRegex();
			$this->regex = array( '', '' );
			if ( $base[0] !== '' ) {
				$this->regex[0] = "/{$base[0]}/iuS";
			}
			if ( $base[1] !== '' ) {
				$this->regex[1] = "/{$base[1]}/S";
			}
		}
		return $this->regex;
	}

	/**
	 * Get a regex for matching variables with parameters
	 *
	 * @return array Same shape as getRegex(), since str_replace() is applied
	 *  to each of its entries
	 */
	public function getVariableRegex() {
		return str_replace( "\\$1", "(.*?)", $this->getRegex() );
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return array
	 */
	public function getRegexStart() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = "/^(?:{$base[0]})/iuS";
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = "/^(?:{$base[1]})/S";
		}
		return $newRegex;
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * @return array
	 */
	public function getVariableStartToEndRegex() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
		}
		return $newRegex;
	}

	/**
	 * @since 1.20
	 * @return array
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 * Returns array(magic word ID, parameter value)
	 * If there is no parameter value, that element will be false.
	 *
	 * @param array $m
	 *
	 * @throws MWException
	 * @return array
	 */
	public function parseMatch( $m ) {
		reset( $m );
		// Walk the array with key()/current()/next() rather than each(), which
		// is deprecated as of PHP 7.2 and removed in PHP 8.0. The internal
		// pointer is advanced before the checks, exactly as each() did, so
		// that the next() call further down lands on the same element.
		while ( ( $key = key( $m ) ) !== null ) {
			$value = current( $m );
			next( $m );
			if ( $key === 0 || $value === '' ) {
				continue;
			}
			$parts = explode( '_', $key, 2 );
			if ( count( $parts ) != 2 ) {
				// This shouldn't happen
				// continue;
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			list( /* $synIndex */, $magicName ) = $parts;
			// The pointer sits on the numeric duplicate of the named group;
			// one more step reaches the parameter capture, if there is one.
			$paramValue = next( $m );
			return array( $magicName, $paramValue );
		}
		// This shouldn't happen either
		throw new MWException( __METHOD__ . ': parameter not found' );
	}

	/**
	 * Match some text, with parameter capture
	 * Returns an array with the magic word name in the first element and the
	 * parameter in the second element.
	 * Both elements are false if there was no match.
	 *
	 * @param string $text
	 *
	 * @return array
	 */
	public function matchVariableStartToEnd( $text ) {
		$regexes = $this->getVariableStartToEndRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex !== '' ) {
				$m = array();
				if ( preg_match( $regex, $text, $m ) ) {
					return $this->parseMatch( $m );
				}
			}
		}
		return array( false, false );
	}

	/**
	 * Match some text, without parameter capture
	 * Returns the magic word name, or false if there was no capture
	 *
	 * The case-sensitive table is consulted first, then the lower-cased text
	 * is looked up in the case-insensitive table.
	 *
	 * @param string $text
	 *
	 * @return string|bool False on failure
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		global $wgContLang;
		$lc = $wgContLang->lc( $text );
		if ( isset( $hash[0][$lc] ) ) {
			return $hash[0][$lc];
		}
		return false;
	}

	/**
	 * Returns an associative array, ID => param value, for all items that match
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @param string $text
	 *
	 * @return array
	 */
	public function matchAndRemove( &$text ) {
		$found = array();
		$regexes = $this->getRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$matches = array();
			preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			foreach ( $matches as $m ) {
				list( $name, $param ) = $this->parseMatch( $m );
				$found[$name] = $param;
			}
			$text = preg_replace( $regex, '', $text );
		}
		return $found;
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 * Return false if no match found and $text is not modified.
	 * Does not match parameters.
	 *
	 * @param string $text
	 *
	 * @return string|bool False on failure
	 */
	public function matchStartAndRemove( &$text ) {
		$regexes = $this->getRegexStart();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$m = array();
			if ( preg_match( $regex, $text, $m ) ) {
				list( $id, ) = $this->parseMatch( $m );
				if ( strlen( $m[0] ) >= strlen( $text ) ) {
					// The whole text was the magic word
					$text = '';
				} else {
					$text = substr( $text, strlen( $m[0] ) );
				}
				return $id;
			}
		}
		return false;
	}
}