includes/libs/StringUtils.php

   1 <?php
   2 /**
   3  * Methods to play with strings.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  */
  22
  23 /**
  24  * A collection of static methods to play with strings.
  25  */
  26 class StringUtils {
  27         /**
  28          * Test whether a string is valid UTF-8.
  29          *
  30          * The function check for invalid byte sequences, overlong encoding but
  31          * not for different normalisations.
  32          *
  33          * This relies internally on the mbstring function mb_check_encoding()
  34          * hardcoded to check against UTF-8. Whenever the function is not available
  35          * we fallback to a pure PHP implementation. Setting $disableMbstring to
  36          * true will skip the use of mb_check_encoding, this is mostly intended for
  37          * unit testing our internal implementation.
  38          *
  39          * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
  40          * In particular, the pure PHP code path did not in fact check for overlong forms.
  41          * Beware of this when backporting code to that version of MediaWiki.
  42          *
  43          * @since 1.21
  44          * @param string $value String to check
  45          * @param bool $disableMbstring Whether to use the pure PHP
  46          *  implementation instead of trying mb_check_encoding. Intended for unit
  47          *  testing. Default: false
  48          * @return bool Whether the given $value is a valid UTF-8 encoded string
  49          */
  50         static function isUtf8( $value, $disableMbstring = false ) {
  51                 $value = (string)$value;
  52
  53                 // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
  54                 // U+10FFFF are incorrectly allowed, so we have to check for them separately.
  55                 if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
  56                         static $newPHP;
  57                         if ( $newPHP === null ) {
  58                                 $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
  59                         }
  60
  61                         return mb_check_encoding( $value, 'UTF-8' ) &&
  62                                 ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
  63                 }
  64
  65                 if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
  66                         // String contains only ASCII characters, has to be valid
  67                         return true;
  68                 }
  69
  70                 // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
  71                 // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
  72                 // sequences, which can be as long as the input string is. Multiple short regexes are
  73                 // used rather than a single long regex for performance.
  74                 static $regexes;
  75                 if ( $regexes === null ) {
  76                         $cont = "[\x80-\xbf]";
  77                         $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
  78                         $regexes = array(
  79                                 // Continuation byte at the start
  80                                 "/^$cont/",
  81
  82                                 // ASCII byte followed by a continuation byte
  83                                 "/[\\x00-\x7f]$cont/S",
  84
  85                                 // Illegal byte
  86                                 "/[\xc0\xc1\xf5-\xff]/S",
  87
  88                                 // Invalid 2-byte sequence, or valid one then an extra continuation byte
  89                                 "/[\xc2-\xdf](?!$cont$after)/S",
  90
  91                                 // Invalid 3-byte sequence, or valid one then an extra continuation byte
  92                                 "/\xe0(?![\xa0-\xbf]$cont$after)/",
  93                                 "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
  94                                 "/\xed(?![\x80-\x9f]$cont$after)/",
  95
  96                                 // Invalid 4-byte sequence, or valid one then an extra continuation byte
  97                                 "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
  98                                 "/[\xf1-\xf3](?!$cont{3}$after)/S",
  99                                 "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
 100                         );
 101                 }
 102
 103                 foreach ( $regexes as $regex ) {
 104                         if ( preg_match( $regex, $value ) !== 0 ) {
 105                                 return false;
 106                         }
 107                 }
 108
 109                 return true;
 110         }
 111
 112         /**
 113          * Perform an operation equivalent to `preg_replace()`
 114          *
 115          * Matches this code:
 116          *
 117          *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
 118          *
 119          * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
 120          * implementation is fast but memory-hungry and inflexible. The memory requirements are such
 121          * that I don't recommend using it on anything but guaranteed small chunks of text.
 122          *
 123          * @param string $startDelim
 124          * @param string $endDelim
 125          * @param string $replace
 126          * @param string $subject
 127          * @return string
 128          */
 129         static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
 130                 $segments = explode( $startDelim, $subject );
 131                 $output = array_shift( $segments );
 132                 foreach ( $segments as $s ) {
 133                         $endDelimPos = strpos( $s, $endDelim );
 134                         if ( $endDelimPos === false ) {
 135                                 $output .= $startDelim . $s;
 136                         } else {
 137                                 $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
 138                         }
 139                 }
 140
 141                 return $output;
 142         }
 143
 144         /**
 145          * Perform an operation equivalent to `preg_replace_callback()`
 146          *
 147          * Matches this code:
 148          *
 149          *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 150          *
 151          * If the start delimiter ends with an initial substring of the end delimiter,
 152          * e.g. in the case of C-style comments, the behavior differs from the model
 153          * regex. In this implementation, the end must share no characters with the
 154          * start, so e.g. `/*\/` is not considered to be both the start and end of a
 155          * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 156          *
 157          * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 158          * but uses far less memory. The delimiters are literal strings, not regular expressions.
 159          *
 160          * @param string $startDelim Start delimiter
 161          * @param string $endDelim End delimiter
 162          * @param callable $callback Function to call on each match
 163          * @param string $subject
 164          * @param string $flags Regular expression flags
 165          * @throws InvalidArgumentException
 166          * @return string
 167          */
 168         static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
 169                 $subject, $flags = ''
 170         ) {
 171                 $inputPos = 0;
 172                 $outputPos = 0;
 173                 $output = '';
 174                 $foundStart = false;
 175                 $encStart = preg_quote( $startDelim, '!' );
 176                 $encEnd = preg_quote( $endDelim, '!' );
 177                 $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
 178                 $endLength = strlen( $endDelim );
 179                 $m = array();
 180
 181                 while ( $inputPos < strlen( $subject ) &&
 182                         preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
 183                 ) {
 184                         $tokenOffset = $m[0][1];
 185                         if ( $m[1][0] != '' ) {
 186                                 if ( $foundStart &&
 187                                         $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
 188                                 ) {
 189                                         # An end match is present at the same location
 190                                         $tokenType = 'end';
 191                                         $tokenLength = $endLength;
 192                                 } else {
 193                                         $tokenType = 'start';
 194                                         $tokenLength = strlen( $m[0][0] );
 195                                 }
 196                         } elseif ( $m[2][0] != '' ) {
 197                                 $tokenType = 'end';
 198                                 $tokenLength = strlen( $m[0][0] );
 199                         } else {
 200                                 throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
 201                         }
 202
 203                         if ( $tokenType == 'start' ) {
 204                                 # Only move the start position if we haven't already found a start
 205                                 # This means that START START END matches outer pair
 206                                 if ( !$foundStart ) {
 207                                         # Found start
 208                                         $inputPos = $tokenOffset + $tokenLength;
 209                                         # Write out the non-matching section
 210                                         $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
 211                                         $outputPos = $tokenOffset;
 212                                         $contentPos = $inputPos;
 213                                         $foundStart = true;
 214                                 } else {
 215                                         # Move the input position past the *first character* of START,
 216                                         # to protect against missing END when it overlaps with START
 217                                         $inputPos = $tokenOffset + 1;
 218                                 }
 219                         } elseif ( $tokenType == 'end' ) {
 220                                 if ( $foundStart ) {
 221                                         # Found match
 222                                         $output .= call_user_func( $callback, array(
 223                                                 substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
 224                                                 substr( $subject, $contentPos, $tokenOffset - $contentPos )
 225                                         ) );
 226                                         $foundStart = false;
 227                                 } else {
 228                                         # Non-matching end, write it out
 229                                         $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
 230                                 }
 231                                 $inputPos = $outputPos = $tokenOffset + $tokenLength;
 232                         } else {
 233                                 throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
 234                         }
 235                 }
 236                 if ( $outputPos < strlen( $subject ) ) {
 237                         $output .= substr( $subject, $outputPos );
 238                 }
 239
 240                 return $output;
 241         }
 242
 243         /**
 244          * Perform an operation equivalent to `preg_replace()` with flags.
 245          *
 246          * Matches this code:
 247          *
 248          *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 249          *
 250          * @param string $startDelim Start delimiter regular expression
 251          * @param string $endDelim End delimiter regular expression
 252          * @param string $replace Replacement string. May contain $1, which will be
 253          *  replaced by the text between the delimiters
 254          * @param string $subject String to search
 255          * @param string $flags Regular expression flags
 256          * @return string The string with the matches replaced
 257          */
 258         static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
 259                 $replacer = new RegexlikeReplacer( $replace );
 260
 261                 return self::delimiterReplaceCallback( $startDelim, $endDelim,
 262                         $replacer->cb(), $subject, $flags );
 263         }
 264
 265         /**
 266          * More or less "markup-safe" explode()
 267          * Ignores any instances of the separator inside `<...>`
 268          * @param string $separator
 269          * @param string $text
 270          * @return array
 271          */
 272         static function explodeMarkup( $separator, $text ) {
 273                 $placeholder = "\x00";
 274
 275                 // Remove placeholder instances
 276                 $text = str_replace( $placeholder, '', $text );
 277
 278                 // Replace instances of the separator inside HTML-like tags with the placeholder
 279                 $replacer = new DoubleReplacer( $separator, $placeholder );
 280                 $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
 281
 282                 // Explode, then put the replaced separators back in
 283                 $items = explode( $separator, $cleaned );
 284                 foreach ( $items as $i => $str ) {
 285                         $items[$i] = str_replace( $placeholder, $separator, $str );
 286                 }
 287
 288                 return $items;
 289         }
 290
 291         /**
 292          * Escape a string to make it suitable for inclusion in a preg_replace()
 293          * replacement parameter.
 294          *
 295          * @param string $string
 296          * @return string
 297          */
 298         static function escapeRegexReplacement( $string ) {
 299                 $string = str_replace( '\\', '\\\\', $string );
 300                 $string = str_replace( '$', '\\$', $string );
 301                 return $string;
 302         }
 303
 304         /**
 305          * Workalike for explode() with limited memory usage.
 306          *
 307          * @param string $separator
 308          * @param string $subject
 309          * @return ArrayIterator|ExplodeIterator
 310          */
 311         static function explode( $separator, $subject ) {
 312                 if ( substr_count( $subject, $separator ) > 1000 ) {
 313                         return new ExplodeIterator( $separator, $subject );
 314                 } else {
 315                         return new ArrayIterator( explode( $separator, $subject ) );
 316                 }
 317         }
 318 }