includes/libs/StringUtils.php

   1 <?php
   2
   3 use MediaWiki\Libs\UnpackFailedException;
   4 use Wikimedia\Assert\Assert;
   5 use Wikimedia\AtEase\AtEase;
   6
   7 /**
   8  * Methods to play with strings.
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License along
  21  * with this program; if not, write to the Free Software Foundation, Inc.,
  22  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  23  * http://www.gnu.org/copyleft/gpl.html
  24  *
  25  * @file
  26  */
  27
  28 /**
  29  * A collection of static methods to play with strings.
  30  */
  31 class StringUtils {
  32         /**
  33          * Test whether a string is valid UTF-8.
  34          *
  35          * The function check for invalid byte sequences, overlong encoding but
  36          * not for different normalisations.
  37          *
  38          * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
  39          * In particular, the pure PHP code path did not in fact check for overlong forms.
  40          * Beware of this when backporting code to that version of MediaWiki.
  41          *
  42          * @since 1.21
  43          * @param string $value String to check
  44          * @return bool Whether the given $value is a valid UTF-8 encoded string
  45          */
  46         public static function isUtf8( $value ) {
  47                 return mb_check_encoding( (string)$value, 'UTF-8' );
  48         }
  49
  50         /**
  51          * Explode a string, but ignore any instances of the separator inside
  52          * the given start and end delimiters, which may optionally nest.
  53          * The delimiters are literal strings, not regular expressions.
  54          * @param string $startDelim Start delimiter
  55          * @param string $endDelim End delimiter
  56          * @param string $separator Separator string for the explode.
  57          * @param string $subject Subject string to explode.
  58          * @param bool $nested True iff the delimiters are allowed to nest.
  59          * @return ArrayIterator
  60          */
  61         public static function delimiterExplode( $startDelim, $endDelim, $separator,
  62                 $subject, $nested = false ) {
  63                 $inputPos = 0;
  64                 $lastPos = 0;
  65                 $depth = 0;
  66                 $encStart = preg_quote( $startDelim, '!' );
  67                 $encEnd = preg_quote( $endDelim, '!' );
  68                 $encSep = preg_quote( $separator, '!' );
  69                 $len = strlen( $subject );
  70                 $m = [];
  71                 $exploded = [];
  72                 while (
  73                         $inputPos < $len &&
  74                         preg_match(
  75                                 "!$encStart|$encEnd|$encSep!S", $subject, $m,
  76                                 PREG_OFFSET_CAPTURE, $inputPos
  77                         )
  78                 ) {
  79                         $match = $m[0][0];
  80                         $matchPos = $m[0][1];
  81                         $inputPos = $matchPos + strlen( $match );
  82                         if ( $match === $separator ) {
  83                                 if ( $depth === 0 ) {
  84                                         $exploded[] = substr(
  85                                                 $subject, $lastPos, $matchPos - $lastPos
  86                                         );
  87                                         $lastPos = $inputPos;
  88                                 }
  89                         } elseif ( $match === $startDelim ) {
  90                                 if ( $depth === 0 || $nested ) {
  91                                         $depth++;
  92                                 }
  93                         } else {
  94                                 $depth--;
  95                         }
  96                 }
  97                 $exploded[] = substr( $subject, $lastPos );
  98                 // This method could be rewritten in the future to avoid creating an
  99                 // intermediate array, since the return type is just an iterator.
 100                 return new ArrayIterator( $exploded );
 101         }
 102
 103         /**
 104          * Perform an operation equivalent to `preg_replace()`
 105          *
 106          * Matches this code:
 107          *
 108          *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
 109          *
 110          * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
 111          * implementation is fast but memory-hungry and inflexible. The memory requirements are such
 112          * that I don't recommend using it on anything but guaranteed small chunks of text.
 113          *
 114          * @param string $startDelim
 115          * @param string $endDelim
 116          * @param string $replace
 117          * @param string $subject
 118          * @return string
 119          */
 120         public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
 121                 $segments = explode( $startDelim, $subject );
 122                 $output = array_shift( $segments );
 123                 foreach ( $segments as $s ) {
 124                         $endDelimPos = strpos( $s, $endDelim );
 125                         if ( $endDelimPos === false ) {
 126                                 $output .= $startDelim . $s;
 127                         } else {
 128                                 $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
 129                         }
 130                 }
 131
 132                 return $output;
 133         }
 134
 135         /**
 136          * Perform an operation equivalent to `preg_replace_callback()`
 137          *
 138          * Matches this code:
 139          *
 140          *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 141          *
 142          * If the start delimiter ends with an initial substring of the end delimiter,
 143          * e.g. in the case of C-style comments, the behavior differs from the model
 144          * regex. In this implementation, the end must share no characters with the
 145          * start, so e.g. `/*\/` is not considered to be both the start and end of a
 146          * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 147          *
 148          * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 149          * but uses far less memory. The delimiters are literal strings, not regular expressions.
 150          *
 151          * @param string $startDelim Start delimiter
 152          * @param string $endDelim End delimiter
 153          * @param callable $callback Function to call on each match
 154          * @param string $subject
 155          * @param string $flags Regular expression flags
 156          * @return string
 157          */
 158         private static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
 159                 $subject, $flags = ''
 160         ) {
 161                 $inputPos = 0;
 162                 $outputPos = 0;
 163                 $contentPos = 0;
 164                 $output = '';
 165                 $foundStart = false;
 166                 $encStart = preg_quote( $startDelim, '!' );
 167                 $encEnd = preg_quote( $endDelim, '!' );
 168                 $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
 169                 $endLength = strlen( $endDelim );
 170                 $m = [];
 171
 172                 while ( $inputPos < strlen( $subject ) &&
 173                         preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
 174                 ) {
 175                         $tokenOffset = $m[0][1];
 176                         if ( $m[1][0] != '' ) {
 177                                 if ( $foundStart &&
 178                                         $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
 179                                 ) {
 180                                         # An end match is present at the same location
 181                                         $tokenType = 'end';
 182                                         $tokenLength = $endLength;
 183                                 } else {
 184                                         $tokenType = 'start';
 185                                         $tokenLength = strlen( $m[0][0] );
 186                                 }
 187                         } elseif ( $m[2][0] != '' ) {
 188                                 $tokenType = 'end';
 189                                 $tokenLength = strlen( $m[0][0] );
 190                         } else {
 191                                 throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
 192                         }
 193
 194                         if ( $tokenType == 'start' ) {
 195                                 # Only move the start position if we haven't already found a start
 196                                 # This means that START START END matches outer pair
 197                                 if ( !$foundStart ) {
 198                                         # Found start
 199                                         $inputPos = $tokenOffset + $tokenLength;
 200                                         # Write out the non-matching section
 201                                         $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
 202                                         $outputPos = $tokenOffset;
 203                                         $contentPos = $inputPos;
 204                                         $foundStart = true;
 205                                 } else {
 206                                         # Move the input position past the *first character* of START,
 207                                         # to protect against missing END when it overlaps with START
 208                                         $inputPos = $tokenOffset + 1;
 209                                 }
 210                         } elseif ( $tokenType == 'end' ) {
 211                                 if ( $foundStart ) {
 212                                         # Found match
 213                                         $output .= $callback( [
 214                                                 substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
 215                                                 substr( $subject, $contentPos, $tokenOffset - $contentPos )
 216                                         ] );
 217                                         $foundStart = false;
 218                                 } else {
 219                                         # Non-matching end, write it out
 220                                         $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
 221                                 }
 222                                 $inputPos = $outputPos = $tokenOffset + $tokenLength;
 223                         } else {
 224                                 throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
 225                         }
 226                 }
 227                 if ( $outputPos < strlen( $subject ) ) {
 228                         $output .= substr( $subject, $outputPos );
 229                 }
 230
 231                 return $output;
 232         }
 233
 234         /**
 235          * Perform an operation equivalent to `preg_replace()` with flags.
 236          *
 237          * Matches this code:
 238          *
 239          *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 240          *
 241          * @param string $startDelim Start delimiter regular expression
 242          * @param string $endDelim End delimiter regular expression
 243          * @param string $replace Replacement string. May contain $1, which will be
 244          *  replaced by the text between the delimiters
 245          * @param string $subject String to search
 246          * @param string $flags Regular expression flags
 247          * @return string The string with the matches replaced
 248          */
 249         public static function delimiterReplace(
 250                 $startDelim, $endDelim, $replace, $subject, $flags = ''
 251         ) {
 252                 return self::delimiterReplaceCallback(
 253                         $startDelim, $endDelim,
 254                         static function ( array $matches ) use ( $replace ) {
 255                                 return strtr( $replace, [ '$0' => $matches[0], '$1' => $matches[1] ] );
 256                         },
 257                         $subject, $flags
 258                 );
 259         }
 260
 261         /**
 262          * More or less "markup-safe" str_replace()
 263          * Ignores any instances of the separator inside `<...>`
 264          * @param string $search
 265          * @param string $replace
 266          * @param string $text
 267          * @return string
 268          */
 269         public static function replaceMarkup( $search, $replace, $text ) {
 270                 $placeholder = "\x00";
 271
 272                 // Remove placeholder instances
 273                 $text = str_replace( $placeholder, '', $text );
 274
 275                 // Replace instances of the separator inside HTML-like tags with the placeholder
 276                 $cleaned = self::delimiterReplaceCallback(
 277                         '<', '>',
 278                         static function ( array $matches ) use ( $search, $placeholder ) {
 279                                 return str_replace( $search, $placeholder, $matches[0] );
 280                         },
 281                         $text
 282                 );
 283
 284                 // Explode, then put the replaced separators back in
 285                 $cleaned = str_replace( $search, $replace, $cleaned );
 286                 $text = str_replace( $placeholder, $search, $cleaned );
 287
 288                 return $text;
 289         }
 290
 291         /**
 292          * Utility function to check if the given string is a valid PCRE regex. Avoids
 293          * manually calling suppressWarnings and restoreWarnings, and provides a
 294          * one-line solution without the need to use @.
 295          *
 296          * @since 1.34
 297          * @param string $string The string you want to check being a valid regex
 298          * @return bool
 299          */
 300         public static function isValidPCRERegex( $string ) {
 301                 AtEase::suppressWarnings();
 302                 // @phan-suppress-next-line PhanParamSuspiciousOrder False positive
 303                 $isValid = preg_match( $string, '' );
 304                 AtEase::restoreWarnings();
 305                 return $isValid !== false;
 306         }
 307
 308         /**
 309          * Escape a string to make it suitable for inclusion in a preg_replace()
 310          * replacement parameter.
 311          *
 312          * @param string $string
 313          * @return string
 314          */
 315         public static function escapeRegexReplacement( $string ) {
 316                 $string = str_replace( '\\', '\\\\', $string );
 317                 return str_replace( '$', '\\$', $string );
 318         }
 319
 320         /**
 321          * Workalike for explode() with limited memory usage.
 322          *
 323          * @param string $separator
 324          * @param string $subject
 325          * @return ArrayIterator|ExplodeIterator
 326          */
 327         public static function explode( $separator, $subject ) {
 328                 if ( substr_count( $subject, $separator ) > 1000 ) {
 329                         return new ExplodeIterator( $separator, $subject );
 330                 } else {
 331                         return new ArrayIterator( explode( $separator, $subject ) );
 332                 }
 333         }
 334
 335         /**
 336          * Wrapper around php's unpack.
 337          *
 338          * @param string $format The format string (See php's docs)
 339          * @param string $data A binary string of binary data
 340          * @param int|false $length The minimum length of $data or false. This is to
 341          *      prevent reading beyond the end of $data. false to disable the check.
 342          *
 343          * Also be careful when using this function to read unsigned 32 bit integer
 344          * because php might make it negative.
 345          *
 346          * @throws UnpackFailedException If $data not long enough, or if unpack fails
 347          * @return array Associative array of the extracted data
 348          * @since 1.42
 349          */
 350         public static function unpack( string $format, string $data, $length = false ): array {
 351                 Assert::parameterType( [ 'integer', 'false' ], $length, '$length' );
 352                 if ( $length !== false ) {
 353                         $realLen = strlen( $data );
 354                         if ( $realLen < $length ) {
 355                                 throw new UnpackFailedException( "Tried to unpack a "
 356                                         . "string of length $realLen, but needed one "
 357                                         . "of at least length $length."
 358                                 );
 359                         }
 360                 }
 361
 362                 AtEase::suppressWarnings();
 363                 $result = unpack( $format, $data );
 364                 AtEase::restoreWarnings();
 365
 366                 if ( $result === false ) {
 367                         // If it cannot extract the packed data.
 368                         throw new UnpackFailedException( "unpack could not unpack binary data" );
 369                 }
 370                 return $result;
 371         }
 372 }