Add a sort parameter to SearchEngine
[mediawiki.git] / includes / MagicWord.php
blob4b24a00d86cfe3eaa5690966bc84ce3a29c716f7
<?php
/**
 * File for magic words.
 *
 * See docs/magicword.txt.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Parser
 */
/**
 * This class encapsulates "magic words" such as "#redirect", __NOTOC__, etc.
 *
 * @par Usage:
 * @code
 *     if ( MagicWord::get( 'redirect' )->match( $text ) ) {
 *       // some code
 *     }
 * @endcode
 *
 * Possible future improvements:
 *   * Simultaneous searching for a number of magic words
 *   * MagicWord::$mObjects in shared memory
 *
 * Please avoid reading the data out of one of these objects and then writing
 * special case code. If possible, add another match()-like function here.
 *
 * To add magic words in an extension, use $magicWords in a file listed in
 * $wgExtensionMessagesFiles[].
 *
 * @par Example:
 * @code
 * $magicWords = array();
 *
 * $magicWords['en'] = array(
 *   'magicwordkey' => array( 0, 'case_insensitive_magic_word' ),
 *   'magicwordkey2' => array( 1, 'CASE_sensitive_magic_word2' ),
 * );
 * @endcode
 *
 * For magic words which are also Parser variables, add a MagicWordwgVariableIDs
 * hook. Use string keys.
 *
 * @ingroup Parser
 */
class MagicWord {
	/**
	 * Magic word ID. The constructor default is 0, but the IDs used throughout
	 * this file are strings such as 'redirect' or 'currentmonth'.
	 * @var int|string
	 */
	public $mId;

	/** @var array List of synonyms (literal strings, optionally containing "$1") */
	public $mSynonyms;

	/** @var bool Whether matching is case sensitive */
	public $mCaseSensitive;

	/** @var string Unanchored regex, built lazily by initRegex() */
	private $mRegex = '';

	/** @var string Regex anchored at the start, built lazily by initRegex() */
	private $mRegexStart = '';

	/** @var string Regex anchored at both ends, built lazily by initRegex() */
	private $mRegexStartToEnd = '';

	/** @var string Alternation of quoted synonyms without delimiters or modifiers */
	private $mBaseRegex = '';

	/** @var string Unanchored regex with "$1" turned into a capture group */
	private $mVariableRegex = '';

	/** @var string Fully anchored regex with "$1" turned into a capture group */
	private $mVariableStartToEndRegex = '';

	/** @var bool Set by replace() and substituteCallback() */
	private $mModified = false;

	/** @var bool Set by pregRemoveAndRecord() when a match is removed */
	private $mFound = false;

	/** @var bool True once the MagicWordwgVariableIDs hook has been run */
	public static $mVariableIDsInitialised = false;

	/** @var array Parser variable IDs; extended through the MagicWordwgVariableIDs hook */
	public static $mVariableIDs = array(
		'!',
		'currentmonth',
		'currentmonth1',
		'currentmonthname',
		'currentmonthnamegen',
		'currentmonthabbrev',
		'currentday',
		'currentday2',
		'currentdayname',
		'currentyear',
		'currenttime',
		'currenthour',
		'localmonth',
		'localmonth1',
		'localmonthname',
		'localmonthnamegen',
		'localmonthabbrev',
		'localday',
		'localday2',
		'localdayname',
		'localyear',
		'localtime',
		'localhour',
		'numberofarticles',
		'numberoffiles',
		'numberofedits',
		'articlepath',
		'pageid',
		'sitename',
		'server',
		'servername',
		'scriptpath',
		'stylepath',
		'pagename',
		'pagenamee',
		'fullpagename',
		'fullpagenamee',
		'namespace',
		'namespacee',
		'namespacenumber',
		'currentweek',
		'currentdow',
		'localweek',
		'localdow',
		'revisionid',
		'revisionday',
		'revisionday2',
		'revisionmonth',
		'revisionmonth1',
		'revisionyear',
		'revisiontimestamp',
		'revisionuser',
		'revisionsize',
		'subpagename',
		'subpagenamee',
		'talkspace',
		'talkspacee',
		'subjectspace',
		'subjectspacee',
		'talkpagename',
		'talkpagenamee',
		'subjectpagename',
		'subjectpagenamee',
		'numberofusers',
		'numberofactiveusers',
		'numberofpages',
		'currentversion',
		'rootpagename',
		'rootpagenamee',
		'basepagename',
		'basepagenamee',
		'currenttimestamp',
		'localtimestamp',
		'directionmark',
		'contentlanguage',
		'numberofadmins',
		'cascadingsources',
	);

	/** @var array Caching hints for ParserCache: magic word ID => TTL value */
	public static $mCacheTTLs = array(
		'currentmonth' => 86400,
		'currentmonth1' => 86400,
		'currentmonthname' => 86400,
		'currentmonthnamegen' => 86400,
		'currentmonthabbrev' => 86400,
		'currentday' => 3600,
		'currentday2' => 3600,
		'currentdayname' => 3600,
		'currentyear' => 86400,
		'currenttime' => 3600,
		'currenthour' => 3600,
		'localmonth' => 86400,
		'localmonth1' => 86400,
		'localmonthname' => 86400,
		'localmonthnamegen' => 86400,
		'localmonthabbrev' => 86400,
		'localday' => 3600,
		'localday2' => 3600,
		'localdayname' => 3600,
		'localyear' => 86400,
		'localtime' => 3600,
		'localhour' => 3600,
		'numberofarticles' => 3600,
		'numberoffiles' => 3600,
		'numberofedits' => 3600,
		'currentweek' => 3600,
		'currentdow' => 3600,
		'localweek' => 3600,
		'localdow' => 3600,
		'numberofusers' => 3600,
		'numberofactiveusers' => 3600,
		'numberofpages' => 3600,
		'currentversion' => 86400,
		'currenttimestamp' => 3600,
		'localtimestamp' => 3600,
		'pagesinnamespace' => 3600,
		'numberofadmins' => 3600,
		'numberingroup' => 3600,
	);

	/** @var array Double-underscore IDs; extended through the GetDoubleUnderscoreIDs hook */
	public static $mDoubleUnderscoreIDs = array(
		'notoc',
		'nogallery',
		'forcetoc',
		'toc',
		'noeditsection',
		'newsectionlink',
		'nonewsectionlink',
		'hiddencat',
		'index',
		'noindex',
		'staticredirect',
		'notitleconvert',
		'nocontentconvert',
	);

	/** @var array Substitution modifier IDs */
	public static $mSubstIDs = array(
		'subst',
		'safesubst',
	);

	/** @var array Cache of loaded MagicWord objects, keyed by ID */
	public static $mObjects = array();

	/** @var MagicWordArray|null Lazily built by getDoubleUnderscoreArray() */
	public static $mDoubleUnderscoreArray = null;

	/**
	 * @param int|string $id Magic word ID
	 * @param array|string $syn Synonym or list of synonyms (cast to array)
	 * @param bool $cs Whether the word is case sensitive
	 */
	public function __construct( $id = 0, $syn = array(), $cs = false ) {
		$this->mId = $id;
		$this->mSynonyms = (array)$syn;
		$this->mCaseSensitive = $cs;
	}

	/**
	 * Factory: creates an object representing an ID
	 *
	 * Objects are cached in self::$mObjects, so repeated calls with the same
	 * ID return the same instance.
	 *
	 * @param string $id
	 *
	 * @return MagicWord
	 */
	public static function &get( $id ) {
		if ( !isset( self::$mObjects[$id] ) ) {
			$mw = new MagicWord();
			$mw->load( $id );
			self::$mObjects[$id] = $mw;
		}
		return self::$mObjects[$id];
	}

	/**
	 * Get an array of parser variable IDs
	 *
	 * The MagicWordwgVariableIDs hook is run once, on first call, so that
	 * handlers can append to the list.
	 *
	 * @return array
	 */
	public static function getVariableIDs() {
		if ( !self::$mVariableIDsInitialised ) {
			# Get variable IDs
			Hooks::run( 'MagicWordwgVariableIDs', array( &self::$mVariableIDs ) );
			self::$mVariableIDsInitialised = true;
		}
		return self::$mVariableIDs;
	}

	/**
	 * Get an array of parser substitution modifier IDs
	 * @return array
	 */
	public static function getSubstIDs() {
		return self::$mSubstIDs;
	}

	/**
	 * Allow external reads of TTL array
	 *
	 * @param string $id
	 * @return int The TTL registered in self::$mCacheTTLs, or -1 if there is none
	 */
	public static function getCacheTTL( $id ) {
		if ( array_key_exists( $id, self::$mCacheTTLs ) ) {
			return self::$mCacheTTLs[$id];
		}
		return -1;
	}

	/**
	 * Get a MagicWordArray of double-underscore entities
	 *
	 * @return MagicWordArray
	 */
	public static function getDoubleUnderscoreArray() {
		if ( is_null( self::$mDoubleUnderscoreArray ) ) {
			Hooks::run( 'GetDoubleUnderscoreIDs', array( &self::$mDoubleUnderscoreIDs ) );
			self::$mDoubleUnderscoreArray = new MagicWordArray( self::$mDoubleUnderscoreIDs );
		}
		return self::$mDoubleUnderscoreArray;
	}

	/**
	 * Clear the self::$mObjects variable
	 * For use in parser tests
	 */
	public static function clearCache() {
		self::$mObjects = array();
	}

	/**
	 * Initialises this object with an ID
	 *
	 * @param string $id
	 * @throws MWException If no synonyms are set after $wgContLang->getMagic()
	 */
	public function load( $id ) {
		global $wgContLang;
		wfProfileIn( __METHOD__ );
		$this->mId = $id;
		// NOTE(review): getMagic() is expected to populate this object
		// (at least mSynonyms); its implementation is not visible here.
		$wgContLang->getMagic( $this );
		if ( !$this->mSynonyms ) {
			$this->mSynonyms = array( 'brionmademeputthishere' );
			wfProfileOut( __METHOD__ );
			throw new MWException( "Error: invalid magic word '$id'" );
		}
		wfProfileOut( __METHOD__ );
	}

	/**
	 * Preliminary initialisation
	 *
	 * Builds every cached regex variant from the synonym list.
	 * @private
	 */
	public function initRegex() {
		// Sort the synonyms by length, descending, so that the longest synonym
		// matches in precedence to the shortest
		$synonyms = $this->mSynonyms;
		usort( $synonyms, array( $this, 'compareStringLength' ) );

		$escSyn = array();
		foreach ( $synonyms as $synonym ) {
			// In case a magic word contains /, like that's going to happen;)
			$escSyn[] = preg_quote( $synonym, '/' );
		}
		$this->mBaseRegex = implode( '|', $escSyn );

		$case = $this->mCaseSensitive ? '' : 'iu';
		$this->mRegex = "/{$this->mBaseRegex}/{$case}";
		$this->mRegexStart = "/^(?:{$this->mBaseRegex})/{$case}";
		$this->mRegexStartToEnd = "/^(?:{$this->mBaseRegex})$/{$case}";
		// preg_quote() turned a literal "$1" into "\$1"; swap it for a
		// non-greedy capture group to obtain the variable-matching forms.
		$this->mVariableRegex = str_replace( "\\$1", "(.*?)", $this->mRegex );
		$this->mVariableStartToEndRegex = str_replace( "\\$1", "(.*?)",
			$this->mRegexStartToEnd );
	}

	/**
	 * A comparison function that returns -1, 0 or 1 depending on whether the
	 * first string is longer, the same length or shorter than the second
	 * string.
	 *
	 * @param string $s1
	 * @param string $s2
	 *
	 * @return int
	 */
	public function compareStringLength( $s1, $s2 ) {
		$l1 = strlen( $s1 );
		$l2 = strlen( $s2 );
		if ( $l1 < $l2 ) {
			return 1;
		} elseif ( $l1 > $l2 ) {
			return -1;
		}
		return 0;
	}

	/**
	 * Gets a regex representing matching the word
	 *
	 * @return string
	 */
	public function getRegex() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}
		return $this->mRegex;
	}

	/**
	 * Gets the regexp case modifier to use, i.e. i or nothing, to be used if
	 * one is using MagicWord::getBaseRegex(), otherwise it'll be included in
	 * the complete expression
	 *
	 * @return string
	 */
	public function getRegexCase() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}

		return $this->mCaseSensitive ? '' : 'iu';
	}

	/**
	 * Gets a regex matching the word, if it is at the string start
	 *
	 * @return string
	 */
	public function getRegexStart() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}
		return $this->mRegexStart;
	}

	/**
	 * Gets a regex matching the word from start to end of a string
	 *
	 * @return string
	 * @since 1.23
	 */
	public function getRegexStartToEnd() {
		if ( $this->mRegexStartToEnd === '' ) {
			$this->initRegex();
		}
		return $this->mRegexStartToEnd;
	}

	/**
	 * regex without the slashes and what not
	 *
	 * @return string
	 */
	public function getBaseRegex() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}
		return $this->mBaseRegex;
	}

	/**
	 * Returns true if the text contains the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function match( $text ) {
		return (bool)preg_match( $this->getRegex(), $text );
	}

	/**
	 * Returns true if the text starts with the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function matchStart( $text ) {
		return (bool)preg_match( $this->getRegexStart(), $text );
	}

	/**
	 * Returns true if the text matched the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 * @since 1.23
	 */
	public function matchStartToEnd( $text ) {
		return (bool)preg_match( $this->getRegexStartToEnd(), $text );
	}

	/**
	 * Returns NULL if there's no match, the value of $1 otherwise
	 * The return code is the matched string, if there's no variable
	 * part in the regex and the matched variable part ($1) if there
	 * is one.
	 *
	 * @param string $text
	 *
	 * @return string|null
	 */
	public function matchVariableStartToEnd( $text ) {
		$matches = array();
		$matchCount = preg_match( $this->getVariableStartToEndRegex(), $text, $matches );
		if ( !$matchCount ) {
			return null;
		}

		# multiple matched parts (variable match); some will be empty because of
		# synonyms. The variable will be the second non-empty one so remove any
		# blank elements and re-sort the indices.
		# See also bug 6526
		#
		# Only empty strings are dropped. A bare array_filter() would also
		# discard the string "0", so a variable part of "0" would be lost and
		# the whole match returned in its place.
		$matches = array_values( array_filter( $matches, function ( $part ) {
			return $part !== '';
		} ) );

		if ( count( $matches ) == 1 ) {
			return $matches[0];
		}
		return $matches[1];
	}

	/**
	 * Returns true if the text matches the word, and alters the
	 * input string, removing all instances of the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function matchAndRemove( &$text ) {
		$this->mFound = false;
		// Pass $this by value: a reference is not needed for a callable
		// (reported to fail on PHP 7.1+).
		$text = preg_replace_callback(
			$this->getRegex(),
			array( $this, 'pregRemoveAndRecord' ),
			$text
		);

		return $this->mFound;
	}

	/**
	 * Like matchAndRemove(), but only removes the word when it is at the
	 * start of the text.
	 *
	 * @param string $text
	 * @return bool
	 */
	public function matchStartAndRemove( &$text ) {
		$this->mFound = false;
		// Pass $this by value: a reference is not needed for a callable
		// (reported to fail on PHP 7.1+).
		$text = preg_replace_callback(
			$this->getRegexStart(),
			array( $this, 'pregRemoveAndRecord' ),
			$text
		);

		return $this->mFound;
	}

	/**
	 * Used in matchAndRemove()
	 *
	 * Records that a match occurred and replaces it with nothing.
	 *
	 * @return string
	 */
	public function pregRemoveAndRecord() {
		$this->mFound = true;
		return '';
	}

	/**
	 * Replaces the word with something else
	 *
	 * @param string $replacement
	 * @param string $subject
	 * @param int $limit
	 *
	 * @return string
	 */
	public function replace( $replacement, $subject, $limit = -1 ) {
		$res = preg_replace(
			$this->getRegex(),
			StringUtils::escapeRegexReplacement( $replacement ),
			$subject,
			$limit
		);
		$this->mModified = $res !== $subject;
		return $res;
	}

	/**
	 * Variable handling: {{SUBST:xxx}} style words
	 * Calls back a function to determine what to replace xxx with
	 * Input word must contain $1
	 *
	 * @param string $text
	 * @param callable $callback
	 *
	 * @return string
	 */
	public function substituteCallback( $text, $callback ) {
		$res = preg_replace_callback( $this->getVariableRegex(), $callback, $text );
		$this->mModified = $res !== $text;
		return $res;
	}

	/**
	 * Matches the word, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableRegex() {
		if ( $this->mVariableRegex === '' ) {
			$this->initRegex();
		}
		return $this->mVariableRegex;
	}

	/**
	 * Matches the entire string, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableStartToEndRegex() {
		if ( $this->mVariableStartToEndRegex === '' ) {
			$this->initRegex();
		}
		return $this->mVariableStartToEndRegex;
	}

	/**
	 * Accesses the synonym list directly
	 *
	 * @param int $i
	 *
	 * @return string
	 */
	public function getSynonym( $i ) {
		return $this->mSynonyms[$i];
	}

	/**
	 * @return array
	 */
	public function getSynonyms() {
		return $this->mSynonyms;
	}

	/**
	 * Returns true if the last call to replace() or substituteCallback()
	 * returned a modified text, otherwise false.
	 *
	 * @return bool
	 */
	public function getWasModified() {
		return $this->mModified;
	}

	/**
	 * $magicarr is an associative array of (magic word ID => replacement)
	 * This method uses the php feature to do several replacements at the same time,
	 * thereby gaining some efficiency. The result is placed in the out variable
	 * $result. The return value is true if something was replaced.
	 *
	 * Unlike replace(), the replacements are passed to preg_replace() as
	 * given, without going through StringUtils::escapeRegexReplacement().
	 * @todo Should this be static? It doesn't seem to be used at all
	 *
	 * @param array $magicarr
	 * @param string $subject
	 * @param string $result
	 *
	 * @return bool
	 */
	public function replaceMultiple( $magicarr, $subject, &$result ) {
		$search = array();
		$replace = array();
		foreach ( $magicarr as $id => $replacement ) {
			$mw = MagicWord::get( $id );
			$search[] = $mw->getRegex();
			$replace[] = $replacement;
		}

		$result = preg_replace( $search, $replace, $subject );
		return $result !== $subject;
	}

	/**
	 * Adds all the synonyms of this MagicWord to an array, to allow quick
	 * lookup in a list of magic words
	 *
	 * Each synonym is lower-cased with $wgContLang->lc() and used as the key.
	 *
	 * @param array $array
	 * @param string $value
	 */
	public function addToArray( &$array, $value ) {
		global $wgContLang;
		foreach ( $this->mSynonyms as $syn ) {
			$array[$wgContLang->lc( $syn )] = $value;
		}
	}

	/**
	 * @return bool
	 */
	public function isCaseSensitive() {
		return $this->mCaseSensitive;
	}

	/**
	 * @return int|string
	 */
	public function getId() {
		return $this->mId;
	}
}
/**
 * Class for handling an array of magic words
 * @ingroup Parser
 */
class MagicWordArray {
	/** @var array List of magic word IDs */
	public $names = array();

	/** @var array|null Cache for getHash(); reset by add() and addArray() */
	private $hash;

	/** @var array|null Cache for getBaseRegex(); reset by add() and addArray() */
	private $baseRegex;

	/** @var array|null Cache for getRegex(); reset by add() and addArray() */
	private $regex;

	/** @todo Unused? */
	private $matches;

	/**
	 * @param array $names
	 */
	public function __construct( $names = array() ) {
		$this->names = $names;
	}

	/**
	 * Add a magic word by name
	 *
	 * @param string $name
	 */
	public function add( $name ) {
		$this->names[] = $name;
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Add a number of magic words by name
	 *
	 * @param array $names
	 */
	public function addArray( $names ) {
		$this->names = array_merge( $this->names, array_values( $names ) );
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * Index 0 maps lower-cased synonyms of case-insensitive words to their
	 * ID; index 1 maps the exact synonyms of case-sensitive words to their ID.
	 * @return array
	 */
	public function getHash() {
		if ( is_null( $this->hash ) ) {
			global $wgContLang;
			$this->hash = array( 0 => array(), 1 => array() );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $wgContLang->lc( $syn );
					}
					$this->hash[$case][$syn] = $name;
				}
			}
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * Every synonym becomes a named group "<index>_<id>", where the synonym
	 * index has its digits mapped to letters; parseMatch() relies on this
	 * naming to recover the magic word ID.
	 * @return array Index 0: case-insensitive alternation, index 1: case-sensitive
	 */
	public function getBaseRegex() {
		if ( is_null( $this->baseRegex ) ) {
			$this->baseRegex = array( 0 => '', 1 => '' );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $i => $syn ) {
					// Group name must start with a non-digit in PCRE 8.34+
					$it = strtr( $i, '0123456789', 'abcdefghij' );
					$group = "(?P<{$it}_{$name}>" . preg_quote( $syn, '/' ) . ')';
					if ( $this->baseRegex[$case] === '' ) {
						$this->baseRegex[$case] = $group;
					} else {
						$this->baseRegex[$case] .= '|' . $group;
					}
				}
			}
		}
		return $this->baseRegex;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 * @return array Index 0: case-insensitive regex, index 1: case-sensitive;
	 *   an entry is '' when there are no words of that kind
	 */
	public function getRegex() {
		if ( is_null( $this->regex ) ) {
			$base = $this->getBaseRegex();
			$this->regex = array( '', '' );
			if ( $base[0] !== '' ) {
				$this->regex[0] = "/{$base[0]}/iuS";
			}
			if ( $base[1] !== '' ) {
				$this->regex[1] = "/{$base[1]}/S";
			}
		}
		return $this->regex;
	}

	/**
	 * Get a regex for matching variables with parameters
	 *
	 * @return array Same shape as getRegex(), because str_replace() is
	 *   applied to each element of that array
	 */
	public function getVariableRegex() {
		return str_replace( "\\$1", "(.*?)", $this->getRegex() );
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return array
	 */
	public function getRegexStart() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = "/^(?:{$base[0]})/iuS";
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = "/^(?:{$base[1]})/S";
		}
		return $newRegex;
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * @return array
	 */
	public function getVariableStartToEndRegex() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
		}
		return $newRegex;
	}

	/**
	 * @since 1.20
	 * @return array
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 * Returns array(magic word ID, parameter value)
	 * If there is no parameter value, that element will be false.
	 *
	 * @param array $m
	 *
	 * @throws MWException
	 * @return array
	 */
	public function parseMatch( $m ) {
		reset( $m );
		// Walk the array with key()/current()/next() rather than each(),
		// which was deprecated in PHP 7.2 and removed in PHP 8.0. As with
		// each(), the pointer has already moved on when the body runs.
		while ( ( $key = key( $m ) ) !== null ) {
			$value = current( $m );
			next( $m );
			if ( $key === 0 || $value === '' ) {
				continue;
			}
			$parts = explode( '_', (string)$key, 2 );
			if ( count( $parts ) != 2 ) {
				// This shouldn't happen
				// continue;
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			list( /* $synIndex */, $magicName ) = $parts;
			// The pointer now sits on the numeric duplicate of the named
			// group; the element after it, if any, is the parameter capture.
			$paramValue = next( $m );
			return array( $magicName, $paramValue );
		}
		// This shouldn't happen either
		throw new MWException( __METHOD__ . ': parameter not found' );
	}

	/**
	 * Match some text, with parameter capture
	 * Returns an array with the magic word name in the first element and the
	 * parameter in the second element.
	 * Both elements are false if there was no match.
	 *
	 * @param string $text
	 *
	 * @return array
	 */
	public function matchVariableStartToEnd( $text ) {
		$regexes = $this->getVariableStartToEndRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex !== '' ) {
				$m = array();
				if ( preg_match( $regex, $text, $m ) ) {
					return $this->parseMatch( $m );
				}
			}
		}
		return array( false, false );
	}

	/**
	 * Match some text, without parameter capture
	 * Returns the magic word name, or false if there was no capture
	 *
	 * The case-sensitive table is consulted first, then the case-insensitive
	 * one using the lower-cased text.
	 *
	 * @param string $text
	 *
	 * @return string|bool False on failure
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		global $wgContLang;
		$lc = $wgContLang->lc( $text );
		if ( isset( $hash[0][$lc] ) ) {
			return $hash[0][$lc];
		}
		return false;
	}

	/**
	 * Returns an associative array, ID => param value, for all items that match
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @param string $text
	 *
	 * @return array
	 */
	public function matchAndRemove( &$text ) {
		$found = array();
		$regexes = $this->getRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			foreach ( $matches as $m ) {
				list( $name, $param ) = $this->parseMatch( $m );
				$found[$name] = $param;
			}
			$text = preg_replace( $regex, '', $text );
		}
		return $found;
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 * Return false if no match found and $text is not modified.
	 * Does not match parameters.
	 *
	 * @param string $text
	 *
	 * @return string|bool The magic word name, or false on failure
	 */
	public function matchStartAndRemove( &$text ) {
		$regexes = $this->getRegexStart();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			if ( preg_match( $regex, $text, $m ) ) {
				list( $id, ) = $this->parseMatch( $m );
				// Guard the whole-string case explicitly: substr() could
				// return false rather than '' on older PHP versions.
				if ( strlen( $m[0] ) >= strlen( $text ) ) {
					$text = '';
				} else {
					$text = substr( $text, strlen( $m[0] ) );
				}
				return $id;
			}
		}
		return false;
	}
}