includes/normal/UtfNormal.php

   1 <?php
   2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
   3 # http://www.mediawiki.org/
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along
  16 # with this program; if not, write to the Free Software Foundation, Inc.,
  17 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18 # http://www.gnu.org/copyleft/gpl.html
  19
  20 /**
  21  * Unicode normalization routines for working with UTF-8 strings.
  22  * Currently assumes that input strings are valid UTF-8!
  23  *
  24  * Not as fast as I'd like, but should be usable for most purposes.
  25  * UtfNormal::toNFC() will bail early if given ASCII text or text
  26  * it can quickly determine is already normalized.
  27  *
  28  * All functions can be called static.
  29  *
  30  * See description of forms at http://www.unicode.org/reports/tr15/
  31  *
  32  * @package MediaWiki
  33  */
  34
  35 /** */
  36 require_once 'UtfNormalUtil.php';
  37 require_once 'UtfNormalData.inc';
  38
  39 # Load compatibility decompositions on demand if they are needed.
  40 global $utfCompatibilityDecomp;
  41 $utfCompatibilityDecomp = NULL;
  42
  43 define( 'UNICODE_HANGUL_FIRST', 0xac00 );
  44 define( 'UNICODE_HANGUL_LAST',  0xd7a3 );
  45
  46 define( 'UNICODE_HANGUL_LBASE', 0x1100 );
  47 define( 'UNICODE_HANGUL_VBASE', 0x1161 );
  48 define( 'UNICODE_HANGUL_TBASE', 0x11a7 );
  49
  50 define( 'UNICODE_HANGUL_LCOUNT', 19 );
  51 define( 'UNICODE_HANGUL_VCOUNT', 21 );
  52 define( 'UNICODE_HANGUL_TCOUNT', 28 );
  53 define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );
  54
  55 define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
  56 define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
  57 define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
  58
  59 define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
  60 define( 'UNICODE_SURROGATE_LAST', 0xdfff );
  61 define( 'UNICODE_MAX', 0x10ffff );
  62 define( 'UNICODE_REPLACEMENT', 0xfffd );
  63
  64
  65 define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
  66 define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
  67
  68 define( 'UTF8_HANGUL_LBASE', codepointToUtf8( UNICODE_HANGUL_LBASE ) );
  69 define( 'UTF8_HANGUL_VBASE', codepointToUtf8( UNICODE_HANGUL_VBASE ) );
  70 define( 'UTF8_HANGUL_TBASE', codepointToUtf8( UNICODE_HANGUL_TBASE ) );
  71
  72 define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
  73 define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
  74 define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
  75
  76 define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
  77 define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
  78 define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
  79 define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
  80 #define( 'UTF8_REPLACEMENT', '!' );
  81
  82 define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
  83 define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
  84 define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
  85
  86 # These two ranges are illegal
  87 define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
  88 define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
  89 define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
  90 define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
  91
  92 define( 'UTF8_HEAD', false );
  93 define( 'UTF8_TAIL', true );
  94
  95 /**
  96  *
  97  * @package MediaWiki
  98  */
  99 class UtfNormal {
 100         /**
 101          * The ultimate convenience function! Clean up invalid UTF-8 sequences,
 102          * and convert to normal form C, canonical composition.
 103          *
 104          * Fast return for pure ASCII strings; some lesser optimizations for
 105          * strings containing only known-good characters. Not as fast as toNFC().
 106          *
 107          * @param string $string a UTF-8 string
 108          * @return string a clean, shiny, normalized UTF-8 string
 109          */
 110         function cleanUp( $string ) {
 111                 if( UtfNormal::quickIsNFCVerify( $string ) )
 112                         return $string;
 113                 else
 114                         return UtfNormal::NFC( $string );
 115         }
 116
 117         /**
 118          * Convert a UTF-8 string to normal form C, canonical composition.
 119          * Fast return for pure ASCII strings; some lesser optimizations for
 120          * strings containing only known-good characters.
 121          *
 122          * @param string $string a valid UTF-8 string. Input is not validated.
 123          * @return string a UTF-8 string in normal form C
 124          */
 125         function toNFC( $string ) {
 126                 if( UtfNormal::quickIsNFC( $string ) )
 127                         return $string;
 128                 else
 129                         return UtfNormal::NFC( $string );
 130         }
 131
 132         /**
 133          * Convert a UTF-8 string to normal form D, canonical decomposition.
 134          * Fast return for pure ASCII strings.
 135          *
 136          * @param string $string a valid UTF-8 string. Input is not validated.
 137          * @return string a UTF-8 string in normal form D
 138          */
 139         function toNFD( $string ) {
 140                 if( preg_match( '/[\x80-\xff]/', $string ) )
 141                         return UtfNormal::NFD( $string );
 142                 else
 143                         return $string;
 144         }
 145
 146         /**
 147          * Convert a UTF-8 string to normal form KC, compatibility composition.
 148          * This may cause irreversible information loss, use judiciously.
 149          * Fast return for pure ASCII strings.
 150          *
 151          * @param string $string a valid UTF-8 string. Input is not validated.
 152          * @return string a UTF-8 string in normal form KC
 153          */
 154         function toNFKC( $string ) {
 155                 if( preg_match( '/[\x80-\xff]/', $string ) )
 156                         return UtfNormal::NFKC( $string );
 157                 else
 158                         return $string;
 159         }
 160
 161         /**
 162          * Convert a UTF-8 string to normal form KD, compatibility decomposition.
 163          * This may cause irreversible information loss, use judiciously.
 164          * Fast return for pure ASCII strings.
 165          *
 166          * @param string $string a valid UTF-8 string. Input is not validated.
 167          * @return string a UTF-8 string in normal form KD
 168          */
 169         function toNFKD( $string ) {
 170                 if( preg_match( '/[\x80-\xff]/', $string ) )
 171                         return UtfNormal::NFKD( $string );
 172                 else
 173                         return $string;
 174         }
 175
 176         /**
 177          * Returns true if the string is _definitely_ in NFC.
 178          * Returns false if not or uncertain.
 179          * @param string $string a valid UTF-8 string. Input is not validated.
 180          * @return bool
 181          */
 182         function quickIsNFC( $string ) {
 183                 # ASCII is always valid NFC!
 184                 # If it's pure ASCII, let it through.
 185                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 186
 187                 global $utfCheckNFC, $utfCombiningClass;
 188                 $len = strlen( $string );
 189                 for( $i = 0; $i < $len; $i++ ) {
 190                         $c = $string{$i};
 191                         $n = ord( $c );
 192                         if( $n < 0x80 ) {
 193                                 continue;
 194                         } elseif( $n >= 0xf0 ) {
 195                                 $c = substr( $string, $i, 4 );
 196                                 $i += 3;
 197                         } elseif( $n >= 0xe0 ) {
 198                                 $c = substr( $string, $i, 3 );
 199                                 $i += 2;
 200                         } elseif( $n >= 0xc0 ) {
 201                                 $c = substr( $string, $i, 2 );
 202                                 $i++;
 203                         }
 204                         if( isset( $utfCheckNFC[$c] ) ) {
 205                                 # If it's NO or MAYBE, bail and do the slow check.
 206                                 return false;
 207                         }
 208                         if( isset( $utfCombiningClass[$c] ) ) {
 209                                 # Combining character? We might have to do sorting, at least.
 210                                 return false;
 211                         }
 212                 }
 213                 return true;
 214         }
 215
 216         /**
 217          * Returns true if the string is _definitely_ in NFC.
 218          * Returns false if not or uncertain.
 219          * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 220          * @return bool
 221          */
 222         function quickIsNFCVerify( &$string ) {
 223                 # ASCII is always valid NFC!
 224                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 225
 226                 global $utfCheckNFC, $utfCombiningClass;
 227                 $len = strlen( $string );
 228                 $out = '';
 229                 $state = UTF8_HEAD;
 230                 $looksNormal = true;
 231
 232                 $rep = false;
 233                 $head = 0;
 234                 for( $i = 0; $i < $len; $i++ ) {
 235                         $c = $string{$i};
 236                         $n = ord( $c );
 237                         if( $state == UTF8_TAIL ) {
 238                                 if( $n >= 0x80 && $n < 0xc0 ) {
 239                                         $sequence .= $c;
 240                                         if( --$remaining == 0 ) {
 241                                                 if( ($sequence >= UTF8_SURROGATE_FIRST
 242                                                                 && $sequence <= UTF8_SURROGATE_LAST)
 243                                                         || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
 244                                                         || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
 245                                                         || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
 246                                                         || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
 247                                                         || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
 248                                                         || ($sequence == UTF8_FFFE)
 249                                                         || ($sequence == UTF8_FFFF)
 250                                                         || ($sequence > UTF8_MAX) ) {
 251                                                         $out .= UTF8_REPLACEMENT;
 252                                                         $state = UTF8_HEAD;
 253                                                         continue;
 254                                                 }
 255                                                 if( isset( $utfCheckNFC[$sequence] ) ||
 256                                                         isset( $utfCombiningClass[$sequence] ) ) {
 257                                                         # If it's NO or MAYBE, we'll have to do the slow check.
 258                                                         $looksNormal = false;
 259                                                 }
 260                                                 $out .= $sequence;
 261                                                 $state = UTF8_HEAD;
 262                                                 $head = 0;
 263                                         }
 264                                         continue;
 265                                 }
 266                                 # Not a valid tail byte! DIscard the char we've been building.
 267                                 #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
 268                                 $state = UTF8_HEAD;
 269                                 $out .= UTF8_REPLACEMENT;
 270                         }
 271                         if( $n < 0x09 ) {
 272                                 $out .= UTF8_REPLACEMENT;
 273                         } elseif( $n == 0x0a ) {
 274                                 $out .= $c;
 275                         } elseif( $n < 0x0d ) {
 276                                 $out .= UTF8_REPLACEMENT;
 277                         } elseif( $n == 0x0d ) {
 278                                 # Strip \r silently
 279                         } elseif( $n < 0x20 ) {
 280                                 $out .= UTF8_REPLACEMENT;
 281                         } elseif( $n < 0x80 ) {
 282                                 $out .= $c;
 283                         } elseif( $n < 0xc0 ) {
 284                                 # illegal tail bytes or head byte of overlong sequence
 285                                 if( $head == 0 ) $out .= UTF8_REPLACEMENT;
 286                         } elseif( $n < 0xe0 ) {
 287                                 $state = UTF8_TAIL;
 288                                 $remaining = 1;
 289                                 $sequence = $c;
 290                                 $head = $n;
 291                         } elseif( $n < 0xf0 ) {
 292                                 $state = UTF8_TAIL;
 293                                 $remaining = 2;
 294                                 $sequence = $c;
 295                                 $head = $n;
 296                         } elseif( $n < 0xf8 ) {
 297                                 $state = UTF8_TAIL;
 298                                 $remaining = 3;
 299                                 $sequence = $c;
 300                                 $head = $n;
 301                         } elseif( $n < 0xfc ) {
 302                                 $state = UTF8_TAIL;
 303                                 $remaining = 4;
 304                                 $sequence = $c;
 305                                 $head = $n;
 306                         } elseif( $n < 0xfe ) {
 307                                 $state = UTF8_TAIL;
 308                                 $remaining = 5;
 309                                 $sequence = $c;
 310                                 $head = $n;
 311                         } else {
 312                                 $out .= UTF8_REPLACEMENT;
 313                         }
 314                 }
 315                 if( $state == UTF8_TAIL ) {
 316                         $out .= UTF8_REPLACEMENT;
 317                 }
 318                 $string = $out;
 319                 return $looksNormal;
 320         }
 321
 322         # These take a string and run the normalization on them, without
 323         # checking for validity or any optimization etc. Input must be
 324         # VALID UTF-8!
 325         function NFC( $string ) {
 326                 return $out = UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
 327         }
 328
 329         function NFD( $string ) {
 330                 global $utfCanonicalDecomp;
 331                 return UtfNormal::fastCombiningSort(
 332                         UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) );
 333         }
 334
 335         function NFKC( $string ) {
 336                 return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
 337         }
 338
 339         function NFKD( $string ) {
 340                 global $utfCompatibilityDecomp;
 341                 if( !isset( $utfCompatibilityDecomp ) ) {
 342                         require_once( 'UtfNormalDataK.inc' );
 343                 }
 344                 return UtfNormal::fastCombiningSort(
 345                         UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
 346         }
 347
 348
 349         /**
 350          * Perform decomposition of a UTF-8 string into either D or KD form
 351          * (depending on which decomposition map is passed to us).
 352          * Input is assumed to be *valid* UTF-8. Invalid code will break.
 353          * @private
 354          * @param string &$string Valid UTF-8 string
 355          * @param array &$map hash of expanded decomposition map
 356          * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 357          */
 358         function fastDecompose( &$string, &$map ) {
 359                 $len = strlen( $string );
 360                 $out = '';
 361                 for( $i = 0; $i < $len; $i++ ) {
 362                         $c = $string{$i};
 363                         $n = ord( $c );
 364                         if( $n < 0x80 ) {
 365                                 # ASCII chars never decompose
 366                                 # THEY ARE IMMORTAL
 367                                 $out .= $c;
 368                                 continue;
 369                         } elseif( $n >= 0xf0 ) {
 370                                 $c = substr( $string, $i, 4 );
 371                                 $i += 3;
 372                         } elseif( $n >= 0xe0 ) {
 373                                 $c = substr( $string, $i, 3 );
 374                                 $i += 2;
 375                         } elseif( $n >= 0xc0 ) {
 376                                 $c = substr( $string, $i, 2 );
 377                                 $i++;
 378                         }
 379                         if( isset( $map[$c] ) ) {
 380                                 $out .= $map[$c];
 381                         } else {
 382                                 if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
 383                                         $out .= UtfNormal::decomposeHangul( $c );
 384                                 } else {
 385                                         $out .= $c;
 386                                 }
 387                         }
 388                 }
 389                 return $out;
 390         }
 391
 392         /**
 393          * Decompose a Hangul syllable character into its constituent jamo.
 394          * @param int $c Unicode code point of the character
 395          * @return string a UTF-8 string containing a sequence of jamo
 396          */
 397         function decomposeHangul( $c ) {
 398                 $codepoint = utf8ToCodepoint( $c );
 399                 $index = $codepoint - UNICODE_HANGUL_FIRST;
 400                 $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
 401                 $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
 402                 $t = $index % UNICODE_HANGUL_TCOUNT;
 403                 $out = codepointToUtf8( $l + UNICODE_HANGUL_LBASE );
 404                 $out .= codepointToUtf8( $v + UNICODE_HANGUL_VBASE );
 405                 if( $t ) $out .= codepointToUtf8( $t + UNICODE_HANGUL_TBASE );
 406                 return $out;
 407         }
 408
 409         /**
 410          * Sorts combining characters into canonical order. This is the
 411          * final step in creating decomposed normal forms D and KD.
 412          * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 413          * @return string a UTF-8 string with combining characters sorted in canonical order
 414          */
 415         function fastCombiningSort( $string ) {
 416                 global $utfCombiningClass;
 417                 $replacedCount = 1;
 418                 while( $replacedCount > 0 ) {
 419                         $replacedCount = 0;
 420                         $len = strlen( $string );
 421                         $out = '';
 422                         $lastClass = -1;
 423                         $lastChar = '';
 424                         for( $i = 0; $i < $len; $i++ ) {
 425                                 $c = $string{$i};
 426                                 $n = ord( $c );
 427                                 if( $n >= 0xf0 ) {
 428                                         $c = substr( $string, $i, 4 );
 429                                         $i += 3;
 430                                 } elseif( $n >= 0xe0 ) {
 431                                         $c = substr( $string, $i, 3 );
 432                                         $i += 2;
 433                                 } elseif( $n >= 0xc0 ) {
 434                                         $c = substr( $string, $i, 2 );
 435                                         $i++;
 436                                 }
 437                                 $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 438                                 if( $lastClass == -1 ) {
 439                                         # First one
 440                                         $lastChar = $c;
 441                                         $lastClass = $class;
 442                                 } elseif( $lastClass > $class && $class > 0 ) {
 443                                         # Swap -- put this one on the stack
 444                                         $out .= $c;
 445                                         $replacedCount++;
 446                                 } else {
 447                                         $out .= $lastChar;
 448                                         $lastChar = $c;
 449                                         $lastClass = $class;
 450                                 }
 451                         }
 452                         $out .= $lastChar;
 453                         $string = $out;
 454                 }
 455                 return $string;
 456         }
 457
 458         /**
 459          * Produces canonically composed sequences, i.e. normal form C or KC.
 460          *
 461          * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 462          * @return string a UTF-8 string with canonical precomposed characters used where possible
 463          */
 464         function fastCompose( $string ) {
 465                 global $utfCanonicalComp, $utfCombiningClass;
 466                 $len = strlen( $string );
 467                 $out = '';
 468                 $lastClass = -1;
 469                 $startChar = '';
 470                 $combining = '';
 471                 for( $i = 0; $i < $len; $i++ ) {
 472                         $c = $string{$i};
 473                         $n = ord( $c );
 474                         if( $n >= 0xf0 ) {
 475                                 $c = substr( $string, $i, 4 );
 476                                 $i += 3;
 477                         } elseif( $n >= 0xe0 ) {
 478                                 $c = substr( $string, $i, 3 );
 479                                 $i += 2;
 480                         } elseif( $n >= 0xc0 ) {
 481                                 $c = substr( $string, $i, 2 );
 482                                 $i++;
 483                         }
 484                         $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 485                         $pair = $startChar . $c;
 486                         if( empty( $utfCombiningClass[$c] ) ) {
 487                                 # New start char
 488                                 if( $lastClass == 0 && isset( $utfCanonicalComp[$pair] ) ) {
 489                                         $startChar = $utfCanonicalComp[$pair];
 490                                 } elseif( $lastClass == 0 &&
 491                                           $c >= UTF8_HANGUL_VBASE &&
 492                                           $c <= UTF8_HANGUL_VEND &&
 493                                           $startChar >= UTF8_HANGUL_LBASE &&
 494                                           $startChar <= UTF8_HANGUL_LEND ) {
 495                                         $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
 496                                         $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
 497                                         $hangulPoint = UNICODE_HANGUL_FIRST +
 498                                                 UNICODE_HANGUL_TCOUNT *
 499                                                 (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
 500                                         $startChar = codepointToUtf8( $hangulPoint );
 501                                 } elseif( $lastClass == 0 &&
 502                                           $c >= UTF8_HANGUL_TBASE &&
 503                                           $c <= UTF8_HANGUL_TEND &&
 504                                           $startChar >= UTF8_HANGUL_FIRST &&
 505                                           $startChar <= UTF8_HANGUL_LAST ) {
 506                                         $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
 507                                         $hangulPoint = utf8ToCodepoint( $startChar ) + $tIndex;
 508                                         $startChar = codepointToUtf8( $hangulPoint );
 509                                 } else {
 510                                         $out .= $startChar;
 511                                         $out .= $combining;
 512                                         $startChar = $c;
 513                                         $combining = '';
 514                                 }
 515                         } else {
 516                                 # A combining char; see what we can do with it
 517                                 if( !empty( $startChar ) &&
 518                                         $lastClass < $class &&
 519                                         $class > 0 &&
 520                                         isset( $utfCanonicalComp[$pair] ) ) {
 521                                         $startChar = $utfCanonicalComp[$pair];
 522                                         $class = 0;
 523                                 } else {
 524                                         $combining .= $c;
 525                                 }
 526                         }
 527                         $lastClass = $class;
 528                 }
 529                 $out .= $startChar . $combining;
 530                 return $out;
 531         }
 532
 533         /**
 534          * This is just used for the benchmark, comparing how long it takes to
 535          * interate through a string without really doing anything of substance.
 536          * @param string $string
 537          * @return string
 538          */
 539         function placebo( $string ) {
 540                 $len = strlen( $string );
 541                 $out = '';
 542                 for( $i = 0; $i < $len; $i++ ) {
 543                         $out .= $string{$i};
 544                 }
 545                 return $out;
 546         }
 547 }
 548
 549 ?>