includes/StringUtils.php

   1 <?php
   2 /**
   3  * Methods to play with strings.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  */
  22
  23 /**
  24  * A collection of static methods to play with strings.
  25  */
  26 class StringUtils {
  27
  28         /**
  29          * Test whether a string is valid UTF-8.
  30          *
  31          * The function check for invalid byte sequences, overlong encoding but
  32          * not for different normalisations.
  33          *
  34          * This relies internally on the mbstring function mb_check_encoding()
  35          * hardcoded to check against UTF-8. Whenever the function is not available
  36          * we fallback to a pure PHP implementation. Setting $disableMbstring to
  37          * true will skip the use of mb_check_encoding, this is mostly intended for
  38          * unit testing our internal implementation.
  39          *
  40          * @since 1.21
  41          *
  42          * @param string $value String to check
  43          * @param boolean $disableMbstring Whether to use the pure PHP
  44          * implementation instead of trying mb_check_encoding. Intended for unit
  45          * testing. Default: false
  46          *
  47          * @return boolean Whether the given $value is a valid UTF-8 encoded string
  48          */
  49         static function isUtf8( $value, $disableMbstring = false ) {
  50
  51                 if ( preg_match( '/[\x80-\xff]/', $value ) === 0 ) {
  52                         # no high bit set, this is pure ASCII which is de facto
  53                         # valid UTF-8
  54                         return true;
  55                 }
  56
  57                 if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
  58                         return mb_check_encoding( $value, 'UTF-8' );
  59                 } else {
  60                         $hasUtf8 = preg_match( '/^(?>
  61                                   [\x00-\x7f]
  62                                 | [\xc0-\xdf][\x80-\xbf]
  63                                 | [\xe0-\xef][\x80-\xbf]{2}
  64                                 | [\xf0-\xf7][\x80-\xbf]{3}
  65                                 | [\xf8-\xfb][\x80-\xbf]{4}
  66                                 | \xfc[\x84-\xbf][\x80-\xbf]{4}
  67                         )+$/x', $value );
  68                         return ( $hasUtf8 > 0 );
  69                 }
  70         }
  71
  72         /**
  73          * Perform an operation equivalent to
  74          *
  75          *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
  76          *
  77          * except that it's worst-case O(N) instead of O(N^2)
  78          *
  79          * Compared to delimiterReplace(), this implementation is fast but memory-
  80          * hungry and inflexible. The memory requirements are such that I don't
  81          * recommend using it on anything but guaranteed small chunks of text.
  82          *
  83          * @param $startDelim
  84          * @param $endDelim
  85          * @param $replace
  86          * @param $subject
  87          *
  88          * @return string
  89          */
  90         static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
  91                 $segments = explode( $startDelim, $subject );
  92                 $output = array_shift( $segments );
  93                 foreach ( $segments as $s ) {
  94                         $endDelimPos = strpos( $s, $endDelim );
  95                         if ( $endDelimPos === false ) {
  96                                 $output .= $startDelim . $s;
  97                         } else {
  98                                 $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
  99                         }
 100                 }
 101                 return $output;
 102         }
 103
 104         /**
 105          * Perform an operation equivalent to
 106          *
 107          *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
 108          *
 109          * This implementation is slower than hungryDelimiterReplace but uses far less
 110          * memory. The delimiters are literal strings, not regular expressions.
 111          *
 112          * If the start delimiter ends with an initial substring of the end delimiter,
 113          * e.g. in the case of C-style comments, the behavior differs from the model
 114          * regex. In this implementation, the end must share no characters with the
 115          * start, so e.g. /*\/ is not considered to be both the start and end of a
 116          * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
 117          *
 118          * @param string $startDelim start delimiter
 119          * @param string $endDelim end delimiter
 120          * @param $callback Callback: function to call on each match
 121          * @param $subject String
 122          * @param string $flags regular expression flags
 123          * @throws MWException
 124          * @return string
 125          */
 126         static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
 127                 $inputPos = 0;
 128                 $outputPos = 0;
 129                 $output = '';
 130                 $foundStart = false;
 131                 $encStart = preg_quote( $startDelim, '!' );
 132                 $encEnd = preg_quote( $endDelim, '!' );
 133                 $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
 134                 $endLength = strlen( $endDelim );
 135                 $m = array();
 136
 137                 while ( $inputPos < strlen( $subject ) &&
 138                         preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) )
 139                 {
 140                         $tokenOffset = $m[0][1];
 141                         if ( $m[1][0] != '' ) {
 142                                 if ( $foundStart &&
 143                                         $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 )
 144                                 {
 145                                         # An end match is present at the same location
 146                                         $tokenType = 'end';
 147                                         $tokenLength = $endLength;
 148                                 } else {
 149                                         $tokenType = 'start';
 150                                         $tokenLength = strlen( $m[0][0] );
 151                                 }
 152                         } elseif ( $m[2][0] != '' ) {
 153                                 $tokenType = 'end';
 154                                 $tokenLength = strlen( $m[0][0] );
 155                         } else {
 156                                 throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
 157                         }
 158
 159                         if ( $tokenType == 'start' ) {
 160                                 # Only move the start position if we haven't already found a start
 161                                 # This means that START START END matches outer pair
 162                                 if ( !$foundStart ) {
 163                                         # Found start
 164                                         $inputPos = $tokenOffset + $tokenLength;
 165                                         # Write out the non-matching section
 166                                         $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
 167                                         $outputPos = $tokenOffset;
 168                                         $contentPos = $inputPos;
 169                                         $foundStart = true;
 170                                 } else {
 171                                         # Move the input position past the *first character* of START,
 172                                         # to protect against missing END when it overlaps with START
 173                                         $inputPos = $tokenOffset + 1;
 174                                 }
 175                         } elseif ( $tokenType == 'end' ) {
 176                                 if ( $foundStart ) {
 177                                         # Found match
 178                                         $output .= call_user_func( $callback, array(
 179                                                 substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
 180                                                 substr( $subject, $contentPos, $tokenOffset - $contentPos )
 181                                         ));
 182                                         $foundStart = false;
 183                                 } else {
 184                                         # Non-matching end, write it out
 185                                         $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
 186                                 }
 187                                 $inputPos = $outputPos = $tokenOffset + $tokenLength;
 188                         } else {
 189                                 throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
 190                         }
 191                 }
 192                 if ( $outputPos < strlen( $subject ) ) {
 193                         $output .= substr( $subject, $outputPos );
 194                 }
 195                 return $output;
 196         }
 197
 198         /**
 199          * Perform an operation equivalent to
 200          *
 201          *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
 202          *
 203          * @param string $startDelim start delimiter regular expression
 204          * @param string $endDelim end delimiter regular expression
 205          * @param string $replace replacement string. May contain $1, which will be
 206          *                 replaced by the text between the delimiters
 207          * @param string $subject to search
 208          * @param string $flags regular expression flags
 209          * @return String: The string with the matches replaced
 210          */
 211         static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
 212                 $replacer = new RegexlikeReplacer( $replace );
 213                 return self::delimiterReplaceCallback( $startDelim, $endDelim,
 214                         $replacer->cb(), $subject, $flags );
 215         }
 216
 217         /**
 218          * More or less "markup-safe" explode()
 219          * Ignores any instances of the separator inside <...>
 220          * @param $separator String
 221          * @param $text String
 222          * @return array
 223          */
 224         static function explodeMarkup( $separator, $text ) {
 225                 $placeholder = "\x00";
 226
 227                 // Remove placeholder instances
 228                 $text = str_replace( $placeholder, '', $text );
 229
 230                 // Replace instances of the separator inside HTML-like tags with the placeholder
 231                 $replacer = new DoubleReplacer( $separator, $placeholder );
 232                 $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
 233
 234                 // Explode, then put the replaced separators back in
 235                 $items = explode( $separator, $cleaned );
 236                 foreach ( $items as $i => $str ) {
 237                         $items[$i] = str_replace( $placeholder, $separator, $str );
 238                 }
 239
 240                 return $items;
 241         }
 242
 243         /**
 244          * Escape a string to make it suitable for inclusion in a preg_replace()
 245          * replacement parameter.
 246          *
 247          * @param $string String
 248          * @return String
 249          */
 250         static function escapeRegexReplacement( $string ) {
 251                 $string = str_replace( '\\', '\\\\', $string );
 252                 $string = str_replace( '$', '\\$', $string );
 253                 return $string;
 254         }
 255
 256         /**
 257          * Workalike for explode() with limited memory usage.
 258          * Returns an Iterator
 259          * @param $separator
 260          * @param $subject
 261          * @return ArrayIterator|ExplodeIterator
 262          */
 263         static function explode( $separator, $subject ) {
 264                 if ( substr_count( $subject, $separator ) > 1000 ) {
 265                         return new ExplodeIterator( $separator, $subject );
 266                 } else {
 267                         return new ArrayIterator( explode( $separator, $subject ) );
 268                 }
 269         }
 270 }
 271
 272 /**
 273  * Base class for "replacers", objects used in preg_replace_callback() and
 274  * StringUtils::delimiterReplaceCallback()
 275  */
 276 class Replacer {
 277
 278         /**
 279          * @return array
 280          */
 281         function cb() {
 282                 return array( &$this, 'replace' );
 283         }
 284 }
 285
 286 /**
 287  * Class to replace regex matches with a string similar to that used in preg_replace()
 288  */
 289 class RegexlikeReplacer extends Replacer {
 290         var $r;
 291
 292         /**
 293          * @param $r string
 294          */
 295         function __construct( $r ) {
 296                 $this->r = $r;
 297         }
 298
 299         /**
 300          * @param $matches array
 301          * @return string
 302          */
 303         function replace( $matches ) {
 304                 $pairs = array();
 305                 foreach ( $matches as $i => $match ) {
 306                         $pairs["\$$i"] = $match;
 307                 }
 308                 return strtr( $this->r, $pairs );
 309         }
 310
 311 }
 312
 313 /**
 314  * Class to perform secondary replacement within each replacement string
 315  */
 316 class DoubleReplacer extends Replacer {
 317
 318         /**
 319          * @param $from
 320          * @param $to
 321          * @param $index int
 322          */
 323         function __construct( $from, $to, $index = 0 ) {
 324                 $this->from = $from;
 325                 $this->to = $to;
 326                 $this->index = $index;
 327         }
 328
 329         /**
 330          * @param $matches array
 331          * @return mixed
 332          */
 333         function replace( $matches ) {
 334                 return str_replace( $this->from, $this->to, $matches[$this->index] );
 335         }
 336 }
 337
 338 /**
 339  * Class to perform replacement based on a simple hashtable lookup
 340  */
 341 class HashtableReplacer extends Replacer {
 342         var $table, $index;
 343
 344         /**
 345          * @param $table
 346          * @param $index int
 347          */
 348         function __construct( $table, $index = 0 ) {
 349                 $this->table = $table;
 350                 $this->index = $index;
 351         }
 352
 353         /**
 354          * @param $matches array
 355          * @return mixed
 356          */
 357         function replace( $matches ) {
 358                 return $this->table[$matches[$this->index]];
 359         }
 360 }
 361
 362 /**
 363  * Replacement array for FSS with fallback to strtr()
 364  * Supports lazy initialisation of FSS resource
 365  */
 366 class ReplacementArray {
 367         /*mostly private*/ var $data = false;
 368         /*mostly private*/ var $fss = false;
 369
 370         /**
 371          * Create an object with the specified replacement array
 372          * The array should have the same form as the replacement array for strtr()
 373          * @param array $data
 374          */
 375         function __construct( $data = array() ) {
 376                 $this->data = $data;
 377         }
 378
 379         /**
 380          * @return array
 381          */
 382         function __sleep() {
 383                 return array( 'data' );
 384         }
 385
 386         function __wakeup() {
 387                 $this->fss = false;
 388         }
 389
 390         /**
 391          * Set the whole replacement array at once
 392          */
 393         function setArray( $data ) {
 394                 $this->data = $data;
 395                 $this->fss = false;
 396         }
 397
 398         /**
 399          * @return array|bool
 400          */
 401         function getArray() {
 402                 return $this->data;
 403         }
 404
 405         /**
 406          * Set an element of the replacement array
 407          * @param $from string
 408          * @param $to string
 409          */
 410         function setPair( $from, $to ) {
 411                 $this->data[$from] = $to;
 412                 $this->fss = false;
 413         }
 414
 415         /**
 416          * @param $data array
 417          */
 418         function mergeArray( $data ) {
 419                 $this->data = array_merge( $this->data, $data );
 420                 $this->fss = false;
 421         }
 422
 423         /**
 424          * @param $other
 425          */
 426         function merge( $other ) {
 427                 $this->data = array_merge( $this->data, $other->data );
 428                 $this->fss = false;
 429         }
 430
 431         /**
 432          * @param $from string
 433          */
 434         function removePair( $from ) {
 435                 unset( $this->data[$from] );
 436                 $this->fss = false;
 437         }
 438
 439         /**
 440          * @param $data array
 441          */
 442         function removeArray( $data ) {
 443                 foreach ( $data as $from => $to ) {
 444                         $this->removePair( $from );
 445                 }
 446                 $this->fss = false;
 447         }
 448
 449         /**
 450          * @param $subject string
 451          * @return string
 452          */
 453         function replace( $subject ) {
 454                 if ( function_exists( 'fss_prep_replace' ) ) {
 455                         wfProfileIn( __METHOD__ . '-fss' );
 456                         if ( $this->fss === false ) {
 457                                 $this->fss = fss_prep_replace( $this->data );
 458                         }
 459                         $result = fss_exec_replace( $this->fss, $subject );
 460                         wfProfileOut( __METHOD__ . '-fss' );
 461                 } else {
 462                         wfProfileIn( __METHOD__ . '-strtr' );
 463                         $result = strtr( $subject, $this->data );
 464                         wfProfileOut( __METHOD__ . '-strtr' );
 465                 }
 466                 return $result;
 467         }
 468 }
 469
 470 /**
 471  * An iterator which works exactly like:
 472  *
 473  * foreach ( explode( $delim, $s ) as $element ) {
 474  *    ...
 475  * }
 476  *
 477  * Except it doesn't use 193 byte per element
 478  */
 479 class ExplodeIterator implements Iterator {
 480         // The subject string
 481         var $subject, $subjectLength;
 482
 483         // The delimiter
 484         var $delim, $delimLength;
 485
 486         // The position of the start of the line
 487         var $curPos;
 488
 489         // The position after the end of the next delimiter
 490         var $endPos;
 491
 492         // The current token
 493         var $current;
 494
 495         /**
 496          * Construct a DelimIterator
 497          * @param $delim string
 498          * @param $s string
 499          */
 500         function __construct( $delim, $s ) {
 501                 $this->subject = $s;
 502                 $this->delim = $delim;
 503
 504                 // Micro-optimisation (theoretical)
 505                 $this->subjectLength = strlen( $s );
 506                 $this->delimLength = strlen( $delim );
 507
 508                 $this->rewind();
 509         }
 510
 511         function rewind() {
 512                 $this->curPos = 0;
 513                 $this->endPos = strpos( $this->subject, $this->delim );
 514                 $this->refreshCurrent();
 515         }
 516
 517         function refreshCurrent() {
 518                 if ( $this->curPos === false ) {
 519                         $this->current = false;
 520                 } elseif ( $this->curPos >= $this->subjectLength ) {
 521                         $this->current = '';
 522                 } elseif ( $this->endPos === false ) {
 523                         $this->current = substr( $this->subject, $this->curPos );
 524                 } else {
 525                         $this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
 526                 }
 527         }
 528
 529         function current() {
 530                 return $this->current;
 531         }
 532
 533         function key() {
 534                 return $this->curPos;
 535         }
 536
 537         /**
 538          * @return string
 539          */
 540         function next() {
 541                 if ( $this->endPos === false ) {
 542                         $this->curPos = false;
 543                 } else {
 544                         $this->curPos = $this->endPos + $this->delimLength;
 545                         if ( $this->curPos >= $this->subjectLength ) {
 546                                 $this->endPos = false;
 547                         } else {
 548                                 $this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
 549                         }
 550                 }
 551                 $this->refreshCurrent();
 552                 return $this->current;
 553         }
 554
 555         /**
 556          * @return bool
 557          */
 558         function valid() {
 559                 return $this->curPos !== false;
 560         }
 561 }