Remove score display from search engine
[mediawiki.git] / includes / MagicWord.php
blob7decbee0ab5dfc498e3d963e12336a43ffcbe530
1 <?php
2 /**
3 * File for magic words.
5 * See docs/magicword.txt.
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 * http://www.gnu.org/copyleft/gpl.html
22 * @file
23 * @ingroup Parser
/**
 * This class encapsulates "magic words" such as "#redirect", __NOTOC__, etc.
 *
 * @par Usage:
 * @code
 *     if ( MagicWord::get( 'redirect' )->match( $text ) ) {
 *         // some code
 *     }
 * @endcode
 *
 * Possible future improvements:
 *   * Simultaneous searching for a number of magic words
 *   * MagicWord::$mObjects in shared memory
 *
 * Please avoid reading the data out of one of these objects and then writing
 * special case code. If possible, add another match()-like function here.
 *
 * To add magic words in an extension, use $magicWords in a file listed in
 * $wgExtensionMessagesFiles[].
 *
 * @par Example:
 * @code
 *     $magicWords = array();
 *
 *     $magicWords['en'] = array(
 *         'magicwordkey' => array( 0, 'case_insensitive_magic_word' ),
 *         'magicwordkey2' => array( 1, 'CASE_sensitive_magic_word2' ),
 *     );
 * @endcode
 *
 * For magic words which are also Parser variables, add a MagicWordwgVariableIDs
 * hook. Use string keys.
 *
 * @ingroup Parser
 */
class MagicWord {
	/** @var int|string Magic word ID (string keys such as 'redirect' in practice) */
	public $mId;

	/** @var array List of synonyms (strings) for this magic word */
	public $mSynonyms;

	/** @var bool Whether matching is case sensitive */
	public $mCaseSensitive;

	/** @var string Unanchored regex, built lazily by initRegex() */
	private $mRegex = '';

	/** @var string Regex anchored at the start of the string */
	private $mRegexStart = '';

	/** @var string Regex anchored at both the start and the end of the string */
	private $mRegexStartToEnd = '';

	/** @var string Alternation of the quoted synonyms, without delimiters or modifiers */
	private $mBaseRegex = '';

	/** @var string Unanchored regex in which "$1" has become a capturing wildcard */
	private $mVariableRegex = '';

	/** @var string Fully anchored regex in which "$1" has become a capturing wildcard */
	private $mVariableStartToEndRegex = '';

	/** @var bool Result flag of the last replace() / substituteCallback() call */
	private $mModified = false;

	/** @var bool Set by pregRemoveAndRecord() when a match was removed */
	private $mFound = false;

	/** @var bool True once the MagicWordwgVariableIDs hook has been run */
	public static $mVariableIDsInitialised = false;

	/** @var array IDs of the magic words that are parser variables */
	public static $mVariableIDs = array(
		'!',
		'currentmonth',
		'currentmonth1',
		'currentmonthname',
		'currentmonthnamegen',
		'currentmonthabbrev',
		'currentday',
		'currentday2',
		'currentdayname',
		'currentyear',
		'currenttime',
		'currenthour',
		'localmonth',
		'localmonth1',
		'localmonthname',
		'localmonthnamegen',
		'localmonthabbrev',
		'localday',
		'localday2',
		'localdayname',
		'localyear',
		'localtime',
		'localhour',
		'numberofarticles',
		'numberoffiles',
		'numberofedits',
		'articlepath',
		'pageid',
		'sitename',
		'server',
		'servername',
		'scriptpath',
		'stylepath',
		'pagename',
		'pagenamee',
		'fullpagename',
		'fullpagenamee',
		'namespace',
		'namespacee',
		'namespacenumber',
		'currentweek',
		'currentdow',
		'localweek',
		'localdow',
		'revisionid',
		'revisionday',
		'revisionday2',
		'revisionmonth',
		'revisionmonth1',
		'revisionyear',
		'revisiontimestamp',
		'revisionuser',
		'revisionsize',
		'subpagename',
		'subpagenamee',
		'talkspace',
		'talkspacee',
		'subjectspace',
		'subjectspacee',
		'talkpagename',
		'talkpagenamee',
		'subjectpagename',
		'subjectpagenamee',
		'numberofusers',
		'numberofactiveusers',
		'numberofpages',
		'currentversion',
		'rootpagename',
		'rootpagenamee',
		'basepagename',
		'basepagenamee',
		'currenttimestamp',
		'localtimestamp',
		'directionmark',
		'contentlanguage',
		'numberofadmins',
		'numberofviews',
		'cascadingsources',
	);

	/** @var array Caching hints for ParserCache, as (magic word ID => TTL value) */
	public static $mCacheTTLs = array(
		'currentmonth' => 86400,
		'currentmonth1' => 86400,
		'currentmonthname' => 86400,
		'currentmonthnamegen' => 86400,
		'currentmonthabbrev' => 86400,
		'currentday' => 3600,
		'currentday2' => 3600,
		'currentdayname' => 3600,
		'currentyear' => 86400,
		'currenttime' => 3600,
		'currenthour' => 3600,
		'localmonth' => 86400,
		'localmonth1' => 86400,
		'localmonthname' => 86400,
		'localmonthnamegen' => 86400,
		'localmonthabbrev' => 86400,
		'localday' => 3600,
		'localday2' => 3600,
		'localdayname' => 3600,
		'localyear' => 86400,
		'localtime' => 3600,
		'localhour' => 3600,
		'numberofarticles' => 3600,
		'numberoffiles' => 3600,
		'numberofedits' => 3600,
		'currentweek' => 3600,
		'currentdow' => 3600,
		'localweek' => 3600,
		'localdow' => 3600,
		'numberofusers' => 3600,
		'numberofactiveusers' => 3600,
		'numberofpages' => 3600,
		'currentversion' => 86400,
		'currenttimestamp' => 3600,
		'localtimestamp' => 3600,
		'pagesinnamespace' => 3600,
		'numberofadmins' => 3600,
		'numberofviews' => 3600,
		'numberingroup' => 3600,
	);

	/** @var array IDs of the double-underscore magic words (__NOTOC__ and friends) */
	public static $mDoubleUnderscoreIDs = array(
		'notoc',
		'nogallery',
		'forcetoc',
		'toc',
		'noeditsection',
		'newsectionlink',
		'nonewsectionlink',
		'hiddencat',
		'index',
		'noindex',
		'staticredirect',
		'notitleconvert',
		'nocontentconvert',
	);

	/** @var array IDs of the substitution modifiers */
	public static $mSubstIDs = array(
		'subst',
		'safesubst',
	);

	/** @var array Cache of MagicWord objects created by get(), keyed by ID */
	public static $mObjects = array();

	/** @var MagicWordArray|null Lazily built by getDoubleUnderscoreArray() */
	public static $mDoubleUnderscoreArray = null;

	/**
	 * @param int|string $id Magic word ID
	 * @param array|string $syn Synonym(s); a scalar is cast to a one-element array
	 * @param bool $cs Whether the word is case sensitive
	 */
	public function __construct( $id = 0, $syn = array(), $cs = false ) {
		$this->mId = $id;
		$this->mSynonyms = (array)$syn;
		$this->mCaseSensitive = $cs;
	}

	/**
	 * Factory: creates an object representing an ID
	 *
	 * The object is loaded once and then served from self::$mObjects.
	 *
	 * @param int|string $id
	 *
	 * @return MagicWord
	 */
	public static function &get( $id ) {
		if ( !isset( self::$mObjects[$id] ) ) {
			$mw = new MagicWord();
			$mw->load( $id );
			self::$mObjects[$id] = $mw;
		}
		return self::$mObjects[$id];
	}

	/**
	 * Get an array of parser variable IDs
	 *
	 * The MagicWordwgVariableIDs hook is run on the first call only, so that
	 * handlers can extend the list.
	 *
	 * @return array
	 */
	public static function getVariableIDs() {
		if ( !self::$mVariableIDsInitialised ) {
			# Get variable IDs
			wfRunHooks( 'MagicWordwgVariableIDs', array( &self::$mVariableIDs ) );
			self::$mVariableIDsInitialised = true;
		}
		return self::$mVariableIDs;
	}

	/**
	 * Get an array of parser substitution modifier IDs
	 * @return array
	 */
	public static function getSubstIDs() {
		return self::$mSubstIDs;
	}

	/**
	 * Allow external reads of TTL array
	 *
	 * @param int|string $id
	 * @return int The TTL registered for $id, or -1 if there is none
	 */
	public static function getCacheTTL( $id ) {
		if ( array_key_exists( $id, self::$mCacheTTLs ) ) {
			return self::$mCacheTTLs[$id];
		} else {
			return -1;
		}
	}

	/**
	 * Get a MagicWordArray of double-underscore entities
	 *
	 * Built on first use, after running the GetDoubleUnderscoreIDs hook.
	 *
	 * @return MagicWordArray
	 */
	public static function getDoubleUnderscoreArray() {
		if ( is_null( self::$mDoubleUnderscoreArray ) ) {
			wfRunHooks( 'GetDoubleUnderscoreIDs', array( &self::$mDoubleUnderscoreIDs ) );
			self::$mDoubleUnderscoreArray = new MagicWordArray( self::$mDoubleUnderscoreIDs );
		}
		return self::$mDoubleUnderscoreArray;
	}

	/**
	 * Clear the self::$mObjects variable
	 * For use in parser tests
	 */
	public static function clearCache() {
		self::$mObjects = array();
	}

	/**
	 * Initialises this object with an ID
	 *
	 * Asks $wgContLang->getMagic() to fill in this object; if no synonyms
	 * are set afterwards the ID is treated as invalid.
	 *
	 * @param int|string $id
	 * @throws MWException
	 */
	public function load( $id ) {
		global $wgContLang;
		wfProfileIn( __METHOD__ );
		$this->mId = $id;
		$wgContLang->getMagic( $this );
		if ( !$this->mSynonyms ) {
			$this->mSynonyms = array( 'brionmademeputthishere' );
			wfProfileOut( __METHOD__ );
			throw new MWException( "Error: invalid magic word '$id'" );
		}
		wfProfileOut( __METHOD__ );
	}

	/**
	 * Preliminary initialisation: builds all cached regexes from the synonyms
	 * @private
	 */
	public function initRegex() {
		// Sort the synonyms by length, descending, so that the longest synonym
		// matches in precedence to the shortest
		$synonyms = $this->mSynonyms;
		usort( $synonyms, array( $this, 'compareStringLength' ) );

		$escSyn = array();
		foreach ( $synonyms as $synonym ) {
			// In case a magic word contains /, like that's going to happen;)
			$escSyn[] = preg_quote( $synonym, '/' );
		}
		$this->mBaseRegex = implode( '|', $escSyn );

		$case = $this->mCaseSensitive ? '' : 'iu';
		$this->mRegex = "/{$this->mBaseRegex}/{$case}";
		$this->mRegexStart = "/^(?:{$this->mBaseRegex})/{$case}";
		$this->mRegexStartToEnd = "/^(?:{$this->mBaseRegex})$/{$case}";
		// preg_quote() turned "$1" into "\$1"; swap that for a lazy capture group
		$this->mVariableRegex = str_replace( "\\$1", "(.*?)", $this->mRegex );
		$this->mVariableStartToEndRegex = str_replace( "\\$1", "(.*?)",
			"/^(?:{$this->mBaseRegex})$/{$case}" );
	}

	/**
	 * A comparison function that returns -1, 0 or 1 depending on whether the
	 * first string is longer, the same length or shorter than the second
	 * string.
	 *
	 * @param string $s1
	 * @param string $s2
	 *
	 * @return int
	 */
	public function compareStringLength( $s1, $s2 ) {
		$l1 = strlen( $s1 );
		$l2 = strlen( $s2 );
		if ( $l1 < $l2 ) {
			return 1;
		} elseif ( $l1 > $l2 ) {
			return -1;
		} else {
			return 0;
		}
	}

	/**
	 * Gets a regex representing matching the word
	 *
	 * @return string
	 */
	public function getRegex() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}
		return $this->mRegex;
	}

	/**
	 * Gets the regexp case modifier to use, i.e. i or nothing, to be used if
	 * one is using MagicWord::getBaseRegex(), otherwise it'll be included in
	 * the complete expression
	 *
	 * @return string
	 */
	public function getRegexCase() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}

		return $this->mCaseSensitive ? '' : 'iu';
	}

	/**
	 * Gets a regex matching the word, if it is at the string start
	 *
	 * @return string
	 */
	public function getRegexStart() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}
		return $this->mRegexStart;
	}

	/**
	 * Gets a regex matching the word from start to end of a string
	 *
	 * @return string
	 * @since 1.23
	 */
	public function getRegexStartToEnd() {
		if ( $this->mRegexStartToEnd === '' ) {
			$this->initRegex();
		}
		return $this->mRegexStartToEnd;
	}

	/**
	 * regex without the slashes and what not
	 *
	 * @return string
	 */
	public function getBaseRegex() {
		if ( $this->mRegex === '' ) {
			$this->initRegex();
		}
		return $this->mBaseRegex;
	}

	/**
	 * Returns true if the text contains the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function match( $text ) {
		return (bool)preg_match( $this->getRegex(), $text );
	}

	/**
	 * Returns true if the text starts with the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function matchStart( $text ) {
		return (bool)preg_match( $this->getRegexStart(), $text );
	}

	/**
	 * Returns true if the text matched the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 * @since 1.23
	 */
	public function matchStartToEnd( $text ) {
		return (bool)preg_match( $this->getRegexStartToEnd(), $text );
	}

	/**
	 * Returns NULL if there's no match, the value of $1 otherwise
	 * The return code is the matched string, if there's no variable
	 * part in the regex and the matched variable part ($1) if there
	 * is one.
	 *
	 * @param string $text
	 *
	 * @return string|null
	 */
	public function matchVariableStartToEnd( $text ) {
		$matches = array();
		$matchcount = preg_match( $this->getVariableStartToEndRegex(), $text, $matches );
		if ( $matchcount == 0 ) {
			return null;
		}

		# multiple matched parts (variable match); some will be empty because of
		# synonyms. The variable will be the second non-empty one so remove any
		# blank elements and re-sort the indices.
		# See also bug 6526
		#
		# Only the empty string is dropped: a bare array_filter() would also
		# throw away a legitimate "0" capture.
		$nonEmpty = array();
		foreach ( $matches as $part ) {
			if ( $part !== '' ) {
				$nonEmpty[] = $part;
			}
		}

		if ( count( $nonEmpty ) == 1 ) {
			return $nonEmpty[0];
		} else {
			return $nonEmpty[1];
		}
	}

	/**
	 * Returns true if the text matches the word, and alters the
	 * input string, removing all instances of the word
	 *
	 * @param string $text
	 *
	 * @return bool
	 */
	public function matchAndRemove( &$text ) {
		$this->mFound = false;
		$text = preg_replace_callback(
			$this->getRegex(),
			array( $this, 'pregRemoveAndRecord' ),
			$text
		);

		return $this->mFound;
	}

	/**
	 * Like matchAndRemove(), but only removes the word at the start of the text
	 *
	 * @param string $text
	 * @return bool
	 */
	public function matchStartAndRemove( &$text ) {
		$this->mFound = false;
		$text = preg_replace_callback(
			$this->getRegexStart(),
			array( $this, 'pregRemoveAndRecord' ),
			$text
		);

		return $this->mFound;
	}

	/**
	 * Used in matchAndRemove()
	 *
	 * Records that a match happened and replaces it with nothing.
	 *
	 * @return string
	 */
	public function pregRemoveAndRecord() {
		$this->mFound = true;
		return '';
	}

	/**
	 * Replaces the word with something else
	 *
	 * @param string $replacement
	 * @param string $subject
	 * @param int $limit
	 *
	 * @return string
	 */
	public function replace( $replacement, $subject, $limit = -1 ) {
		$res = preg_replace(
			$this->getRegex(),
			StringUtils::escapeRegexReplacement( $replacement ),
			$subject,
			$limit
		);
		$this->mModified = $res !== $subject;
		return $res;
	}

	/**
	 * Variable handling: {{SUBST:xxx}} style words
	 * Calls back a function to determine what to replace xxx with
	 * Input word must contain $1
	 *
	 * @param string $text
	 * @param callable $callback
	 *
	 * @return string
	 */
	public function substituteCallback( $text, $callback ) {
		$res = preg_replace_callback( $this->getVariableRegex(), $callback, $text );
		$this->mModified = $res !== $text;
		return $res;
	}

	/**
	 * Matches the word, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableRegex() {
		if ( $this->mVariableRegex === '' ) {
			$this->initRegex();
		}
		return $this->mVariableRegex;
	}

	/**
	 * Matches the entire string, where $1 is a wildcard
	 *
	 * @return string
	 */
	public function getVariableStartToEndRegex() {
		if ( $this->mVariableStartToEndRegex === '' ) {
			$this->initRegex();
		}
		return $this->mVariableStartToEndRegex;
	}

	/**
	 * Accesses the synonym list directly
	 *
	 * @param int $i
	 *
	 * @return string
	 */
	public function getSynonym( $i ) {
		return $this->mSynonyms[$i];
	}

	/**
	 * @return array
	 */
	public function getSynonyms() {
		return $this->mSynonyms;
	}

	/**
	 * Returns true if the last call to replace() or substituteCallback()
	 * returned a modified text, otherwise false.
	 *
	 * @return bool
	 */
	public function getWasModified() {
		return $this->mModified;
	}

	/**
	 * $magicarr is an associative array of (magic word ID => replacement)
	 * This method uses the php feature to do several replacements at the same time,
	 * thereby gaining some efficiency. The result is placed in the out variable
	 * $result. The return value is true if something was replaced.
	 *
	 * Unlike replace(), the replacements are handed to preg_replace() as they are.
	 *
	 * @todo Should this be static? It doesn't seem to be used at all
	 *
	 * @param array $magicarr
	 * @param string $subject
	 * @param string $result
	 *
	 * @return bool
	 */
	public function replaceMultiple( $magicarr, $subject, &$result ) {
		$search = array();
		$replace = array();
		foreach ( $magicarr as $id => $replacement ) {
			$mw = MagicWord::get( $id );
			$search[] = $mw->getRegex();
			$replace[] = $replacement;
		}

		$result = preg_replace( $search, $replace, $subject );
		return $result !== $subject;
	}

	/**
	 * Adds all the synonyms of this MagicWord to an array, to allow quick
	 * lookup in a list of magic words
	 *
	 * Each synonym is lower-cased with $wgContLang->lc() before being used as key.
	 *
	 * @param array $array
	 * @param string $value
	 */
	public function addToArray( &$array, $value ) {
		global $wgContLang;
		foreach ( $this->mSynonyms as $syn ) {
			$array[$wgContLang->lc( $syn )] = $value;
		}
	}

	/**
	 * @return bool
	 */
	public function isCaseSensitive() {
		return $this->mCaseSensitive;
	}

	/**
	 * @return int|string
	 */
	public function getId() {
		return $this->mId;
	}
}
/**
 * Class for handling an array of magic words
 * @ingroup Parser
 */
class MagicWordArray {
	/** @var array Magic word IDs (names) held by this array */
	public $names = array();

	/** @var array|null Lazily built by getHash(); reset whenever names are added */
	private $hash;

	/** @var array|null Lazily built by getBaseRegex(); reset whenever names are added */
	private $baseRegex;

	/** @var array|null Lazily built by getRegex(); reset whenever names are added */
	private $regex;

	/** @todo Unused? */
	private $matches;

	/**
	 * @param array $names
	 */
	public function __construct( $names = array() ) {
		$this->names = $names;
	}

	/**
	 * Add a magic word by name
	 *
	 * @param string $name
	 */
	public function add( $name ) {
		$this->names[] = $name;
		// Invalidate everything derived from the name list
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Add a number of magic words by name
	 *
	 * @param array $names
	 */
	public function addArray( $names ) {
		$this->names = array_merge( $this->names, array_values( $names ) );
		// Invalidate everything derived from the name list
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * Index 0 maps lower-cased synonyms of case-insensitive words to their
	 * magic word name; index 1 maps the unchanged synonyms of case-sensitive words.
	 *
	 * @return array
	 */
	public function getHash() {
		if ( is_null( $this->hash ) ) {
			global $wgContLang;
			$this->hash = array( 0 => array(), 1 => array() );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $wgContLang->lc( $syn );
					}
					$this->hash[$case][$syn] = $name;
				}
			}
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * Every synonym becomes a named group "<index letters>_<magic word name>",
	 * which parseMatch() later splits on the first underscore.
	 *
	 * @return array Index 0: case-insensitive alternation, index 1: case-sensitive
	 */
	public function getBaseRegex() {
		if ( is_null( $this->baseRegex ) ) {
			$this->baseRegex = array( 0 => '', 1 => '' );
			foreach ( $this->names as $name ) {
				$magic = MagicWord::get( $name );
				$case = intval( $magic->isCaseSensitive() );
				foreach ( $magic->getSynonyms() as $i => $syn ) {
					// Group name must start with a non-digit in PCRE 8.34+
					$it = strtr( $i, '0123456789', 'abcdefghij' );
					$group = "(?P<{$it}_{$name}>" . preg_quote( $syn, '/' ) . ')';
					if ( $this->baseRegex[$case] === '' ) {
						$this->baseRegex[$case] = $group;
					} else {
						$this->baseRegex[$case] .= '|' . $group;
					}
				}
			}
		}
		return $this->baseRegex;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 *
	 * @return array Index 0: case-insensitive regex, index 1: case-sensitive;
	 *   an entry is '' when there are no words of that kind
	 */
	public function getRegex() {
		if ( is_null( $this->regex ) ) {
			$base = $this->getBaseRegex();
			$this->regex = array( '', '' );
			if ( $base[0] !== '' ) {
				$this->regex[0] = "/{$base[0]}/iuS";
			}
			if ( $base[1] !== '' ) {
				$this->regex[1] = "/{$base[1]}/S";
			}
		}
		return $this->regex;
	}

	/**
	 * Get a regex for matching variables with parameters
	 *
	 * str_replace() is applied to the array from getRegex(), so the result
	 * is an array with the same two entries.
	 *
	 * @return array
	 */
	public function getVariableRegex() {
		return str_replace( "\\$1", "(.*?)", $this->getRegex() );
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return array
	 */
	public function getRegexStart() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = "/^(?:{$base[0]})/iuS";
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = "/^(?:{$base[1]})/S";
		}
		return $newRegex;
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * @return array
	 */
	public function getVariableStartToEndRegex() {
		$base = $this->getBaseRegex();
		$newRegex = array( '', '' );
		if ( $base[0] !== '' ) {
			$newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
		}
		if ( $base[1] !== '' ) {
			$newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
		}
		return $newRegex;
	}

	/**
	 * @since 1.20
	 * @return array
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 * Returns array(magic word ID, parameter value)
	 * If there is no parameter value, that element will be false.
	 *
	 * @param array $m
	 *
	 * @throws MWException
	 * @return array
	 */
	public function parseMatch( $m ) {
		reset( $m );
		// Walk the array with key()/current()/next() rather than each(),
		// which no longer exists in PHP 8. The pointer is advanced before
		// the element is examined, exactly as each() did.
		while ( ( $key = key( $m ) ) !== null ) {
			$value = current( $m );
			next( $m );
			if ( $key === 0 || $value === '' ) {
				continue;
			}
			$parts = explode( '_', $key, 2 );
			if ( count( $parts ) != 2 ) {
				// This shouldn't happen
				// continue;
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			list( /* $synIndex */, $magicName ) = $parts;
			// The pointer now sits on the numeric twin of the named group;
			// the element after it is the parameter capture, if any.
			$paramValue = next( $m );
			return array( $magicName, $paramValue );
		}
		// This shouldn't happen either
		throw new MWException( __METHOD__ . ': parameter not found' );
	}

	/**
	 * Match some text, with parameter capture
	 * Returns an array with the magic word name in the first element and the
	 * parameter in the second element.
	 * Both elements are false if there was no match.
	 *
	 * @param string $text
	 *
	 * @return array
	 */
	public function matchVariableStartToEnd( $text ) {
		$regexes = $this->getVariableStartToEndRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex !== '' ) {
				$m = array();
				if ( preg_match( $regex, $text, $m ) ) {
					return $this->parseMatch( $m );
				}
			}
		}
		return array( false, false );
	}

	/**
	 * Match some text, without parameter capture
	 * Returns the magic word name, or false if there was no capture
	 *
	 * The case-sensitive table is consulted first, then the lower-cased text
	 * is looked up in the case-insensitive one.
	 *
	 * @param string $text
	 *
	 * @return string|bool False on failure
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		global $wgContLang;
		$lc = $wgContLang->lc( $text );
		if ( isset( $hash[0][$lc] ) ) {
			return $hash[0][$lc];
		}
		return false;
	}

	/**
	 * Returns an associative array, ID => param value, for all items that match
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @param string $text
	 *
	 * @return array
	 */
	public function matchAndRemove( &$text ) {
		$found = array();
		$regexes = $this->getRegex();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$matches = array();
			preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			foreach ( $matches as $m ) {
				list( $name, $param ) = $this->parseMatch( $m );
				$found[$name] = $param;
			}
			$text = preg_replace( $regex, '', $text );
		}
		return $found;
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 * Return false if no match found and $text is not modified.
	 * Does not match parameters.
	 *
	 * @param string $text
	 *
	 * @return string|bool Magic word name, or false on failure
	 */
	public function matchStartAndRemove( &$text ) {
		$regexes = $this->getRegexStart();
		foreach ( $regexes as $regex ) {
			if ( $regex === '' ) {
				continue;
			}
			$m = array();
			if ( preg_match( $regex, $text, $m ) ) {
				list( $id, ) = $this->parseMatch( $m );
				// Assign '' explicitly when the whole text was consumed,
				// rather than relying on what substr() returns at the end
				if ( strlen( $m[0] ) >= strlen( $text ) ) {
					$text = '';
				} else {
					$text = substr( $text, strlen( $m[0] ) );
				}
				return $id;
			}
		}
		return false;
	}
}