lib/typo3/class.t3lib_cs.php

   1 <?php
   2 /***************************************************************
   3 *  Copyright notice
   4 *
   5 *  (c) 2003-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
   6 *  All rights reserved
   7 *
   8 *  This script is part of the Typo3 project. The Typo3 project is
   9 *  free software; you can redistribute it and/or modify
  10 *  it under the terms of the GNU General Public License as published by
  11 *  the Free Software Foundation; either version 2 of the License, or
  12 *  (at your option) any later version.
  13 *
  14 *  The GNU General Public License can be found at
  15 *  http://www.gnu.org/copyleft/gpl.html.
  16 *
  17 *  This script is distributed in the hope that it will be useful,
  18 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 *  GNU General Public License for more details.
  21 *
  22 *  This copyright notice MUST APPEAR in all copies of the script!
  23 ***************************************************************/
  24 /**
  25  * Class for conversion between charsets.
  26  *
  27  *    Typo Id: class.t3lib_cs.php,v 1.56 2006/05/03 08:47:30 masi Exp $
  28  * Moodle $Id$
  29  *
  30  * @author      Kasper Skaarhoj <kasperYYYY@typo3.com>
  31  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
  32  */
  33 /**
  34  * [CLASS/FUNCTION INDEX of SCRIPT]
  35  *
  36  *
  37  *
  38  *  136: class t3lib_cs
  39  *  488:     function parse_charset($charset)
  40  *  507:     function get_locale_charset($locale)
  41  *
  42  *              SECTION: Charset Conversion functions
  43  *  560:     function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
  44  *  600:     function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
  45  *  617:     function utf8_encode($str,$charset)
  46  *  663:     function utf8_decode($str,$charset,$useEntityForNoChar=0)
  47  *  706:     function utf8_to_entities($str)
  48  *  739:     function entities_to_utf8($str,$alsoStdHtmlEnt=0)
  49  *  773:     function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
  50  *  823:     function UnumberToChar($cbyte)
  51  *  868:     function utf8CharToUnumber($str,$hex=0)
  52  *
  53  *              SECTION: Init functions
  54  *  911:     function initCharset($charset)
  55  *  973:     function initUnicodeData($mode=null)
  56  * 1198:     function initCaseFolding($charset)
  57  * 1260:     function initToASCII($charset)
  58  *
  59  *              SECTION: String operation functions
  60  * 1331:     function substr($charset,$string,$start,$len=null)
  61  * 1384:     function strlen($charset,$string)
  62  * 1414:     function crop($charset,$string,$len,$crop='')
  63  * 1467:     function strtrunc($charset,$string,$len)
  64  * 1501:     function conv_case($charset,$string,$case)
  65  * 1527:     function specCharsToASCII($charset,$string)
  66  *
  67  *              SECTION: Internal string operation functions
  68  * 1567:     function sb_char_mapping($str,$charset,$mode,$opt='')
  69  *
  70  *              SECTION: Internal UTF-8 string operation functions
  71  * 1622:     function utf8_substr($str,$start,$len=null)
  72  * 1655:     function utf8_strlen($str)
  73  * 1676:     function utf8_strtrunc($str,$len)
  74  * 1698:     function utf8_strpos($haystack,$needle,$offset=0)
  75  * 1723:     function utf8_strrpos($haystack,$needle)
  76  * 1745:     function utf8_char2byte_pos($str,$pos)
  77  * 1786:     function utf8_byte2char_pos($str,$pos)
  78  * 1809:     function utf8_char_mapping($str,$mode,$opt='')
  79  *
  80  *              SECTION: Internal EUC string operation functions
  81  * 1885:     function euc_strtrunc($str,$len,$charset)
  82  * 1914:     function euc_substr($str,$start,$charset,$len=null)
  83  * 1939:     function euc_strlen($str,$charset)
  84  * 1966:     function euc_char2byte_pos($str,$pos,$charset)
  85  * 2007:     function euc_char_mapping($str,$charset,$mode,$opt='')
  86  *
  87  * TOTAL FUNCTIONS: 35
  88  * (This index is automatically created/updated by the extension "extdeveval")
  89  *
  90  */
  91
  92
  93
  94
  95
  96
  97
  98
  99 /**
 100  * Notes on UTF-8
 101  *
 102  * Functions working on UTF-8 strings:
 103  *
 104  * - strchr/strstr
 105  * - strrchr
 106  * - substr_count
 107  * - implode/explode/join
 108  *
 109  * Functions nearly working on UTF-8 strings:
 110  *
 111  * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
 112  * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
 113  * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
 114  * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 115  *
 116  * Functions NOT working on UTF-8 strings:
 117  *
 118  * - str*cmp
 119  * - stristr
 120  * - stripos
 121  * - substr
 122  * - strrev
 123  * - ereg/eregi
 124  * - split/spliti
 125  * - preg_*
 126  * - ...
 127  *
 128  */
 129 /**
 130  * Class for conversion between charsets
 131  *
 132  * @author      Kasper Skaarhoj <kasperYYYY@typo3.com>
 133  * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 134  * @package TYPO3
 135  * @subpackage t3lib
 136  */
 137 class t3lib_cs {
 138         var $noCharByteVal=63;          // ASCII Value for chars with no equivalent.
 139
 140                 // This is the array where parsed conversion tables are stored (cached)
 141         var $parsedCharsets=array();
 142
 143                 // An array where case folding data will be stored (cached)
 144         var $caseFolding=array();
 145
 146                 // An array where charset-to-ASCII mappings are stored (cached)
 147         var $toASCII=array();
 148
 149                 // This tells the converter which charsets have two bytes per char:
 150         var $twoByteSets=array(
 151                 'ucs-2'=>1,     // 2-byte Unicode
 152         );
 153
 154                 // This tells the converter which charsets have four bytes per char:
 155         var $fourByteSets=array(
 156                 'ucs-4'=>1,     // 4-byte Unicode
 157                 'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
 158         );
 159
 160                 // This tells the converter which charsets use a scheme like the Extended Unix Code:
 161         var $eucBasedSets=array(
 162                 'gb2312'=>1,            // Chinese, simplified.
 163                 'big5'=>1,              // Chinese, traditional.
 164                 'euc-kr'=>1,            // Korean
 165                 'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
 166         );
 167
 168                 // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 169                 // http://czyborra.com/charsets/iso8859.html
 170         var $synonyms=array(
 171                 'us' => 'ascii',
 172                 'us-ascii'=> 'ascii',
 173                 'cp819' => 'iso-8859-1',
 174                 'ibm819' => 'iso-8859-1',
 175                 'iso-ir-100' => 'iso-8859-1',
 176                 'iso-ir-109' => 'iso-8859-2',
 177                 'iso-ir-148' => 'iso-8859-9',
 178                 'iso-ir-199' => 'iso-8859-14',
 179                 'iso-ir-203' => 'iso-8859-15',
 180                 'csisolatin1' => 'iso-8859-1',
 181                 'csisolatin2' => 'iso-8859-2',
 182                 'csisolatin3' => 'iso-8859-3',
 183                 'csisolatin5' => 'iso-8859-9',
 184                 'csisolatin8' => 'iso-8859-14',
 185                 'csisolatin9' => 'iso-8859-15',
 186                 'csisolatingreek' => 'iso-8859-7',
 187                 'iso-celtic' => 'iso-8859-14',
 188                 'latin1' => 'iso-8859-1',
 189                 'latin2' => 'iso-8859-2',
 190                 'latin3' => 'iso-8859-3',
 191                 'latin5' => 'iso-8859-9',
 192                 'latin6' => 'iso-8859-10',
 193                 'latin8' => 'iso-8859-14',
 194                 'latin9' => 'iso-8859-15',
 195                 'l1' => 'iso-8859-1',
 196                 'l2' => 'iso-8859-2',
 197                 'l3' => 'iso-8859-3',
 198                 'l5' => 'iso-8859-9',
 199                 'l6' => 'iso-8859-10',
 200                 'l8' => 'iso-8859-14',
 201                 'l9' => 'iso-8859-15',
 202                 'cyrillic' => 'iso-8859-5',
 203                 'arabic' => 'iso-8859-6',
 204                 'tis-620' => 'iso-8859-11',
 205                 'win874' => 'windows-874',
 206                 'win1250' => 'windows-1250',
 207                 'win1251' => 'windows-1251',
 208                 'win1252' => 'windows-1252',
 209                 'win1253' => 'windows-1253',
 210                 'win1254' => 'windows-1254',
 211                 'win1255' => 'windows-1255',
 212                 'win1256' => 'windows-1256',
 213                 'win1257' => 'windows-1257',
 214                 'win1258' => 'windows-1258',
 215                 'cp1250' => 'windows-1250',
 216                 'cp1251' => 'windows-1251',
 217                 'cp1252' => 'windows-1252',
 218                 'ms-ee' => 'windows-1250',
 219                 'ms-ansi' => 'windows-1252',
 220                 'ms-greek' => 'windows-1253',
 221                 'ms-turk' => 'windows-1254',
 222                 'winbaltrim' => 'windows-1257',
 223                 'koi-8ru' => 'koi-8r',
 224                 'koi8r' => 'koi-8r',
 225                 'cp878' => 'koi-8r',
 226                 'mac' => 'macroman',
 227                 'macintosh' => 'macroman',
 228                 'euc-cn' => 'gb2312',
 229                 'x-euc-cn' => 'gb2312',
 230                 'euccn' => 'gb2312',
 231                 'cp936' => 'gb2312',
 232                 'big-5' => 'big5',
 233                 'cp950' => 'big5',
 234                 'eucjp' => 'euc-jp',
 235                 'sjis' => 'shift_jis',
 236                 'shift-jis' => 'shift_jis',
 237                 'cp932' => 'shift_jis',
 238                 'cp949' => 'euc-kr',
 239                 'utf7' => 'utf-7',
 240                 'utf8' => 'utf-8',
 241                 'utf16' => 'utf-16',
 242                 'utf32' => 'utf-32',
 243                 'utf8' => 'utf-8',
 244                 'ucs2' => 'ucs-2',
 245                 'ucs4' => 'ucs-4',
 246         );
 247
 248                 // mapping of iso-639:2 language codes to script names
 249         var $lang_to_script=array(      // values are looked up in $script_to_charset_unix / $script_to_charset_windows by get_locale_charset()
 250                         // iso-639:2 language codes, see:
 251                         //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
 252                         //  http://www.loc.gov/standards/iso639-2/langcodes.html
 253                         //  http://www.unicode.org/onlinedat/languages.html
 254                 'ar' => 'arabic',
 255                 'bg' => 'cyrillic',             // Bulgarian
 256                 'bs' => 'east_european',        // Bosnian
 257                 'cs' => 'east_european',        // Czech
 258                 'da' => 'west_european',        // Danish
 259                 'de' => 'west_european',        // German
 260                 'es' => 'west_european',        // Spanish
 261                 'et' => 'estonian',
 262                 'eo' => 'unicode',              // Esperanto
 263                 'eu' => 'west_european',        // Basque
 264                 'fa' => 'arabic',       // Persian
 265                 'fi' => 'west_european',        // Finnish
 266                 'fo' => 'west_european',        // Faroese
 267                 'fr' => 'west_european',        // French
 268                 'gr' => 'greek',
 269                 'he' => 'hebrew',               // Hebrew (since 1998)
 270                 'hi' => 'unicode',              // Hindi
 271                 'hr' => 'east_european',        // Croatian
 272                 'hu' => 'east_european',        // Hungarian
 273                 'iw' => 'hebrew',               // Hebrew (til 1998)
 274                 'is' => 'west_european',        // Icelandic
 275                 'it' => 'west_european',        // Italian
 276                 'ja' => 'japanese',
 277                 'kl' => 'west_european',        // Greenlandic
 278                 'ko' => 'korean',
 279                 'lt' => 'lithuanian',
 280                 'lv' => 'west_european',        // Latvian/Lettish
 281                 'nl' => 'west_european',        // Dutch
 282                 'no' => 'west_european',        // Norwegian
 283                 'pl' => 'east_european',        // Polish
 284                 'pt' => 'west_european',        // Portuguese
 285                 'ro' => 'east_european',        // Romanian
 286                 'ru' => 'cyrillic',             // Russian
 287                 'sk' => 'east_european',        // Slovak
 288                 'sl' => 'east_european',        // Slovenian
 289                 'sr' => 'cyrillic',             // Serbian
 290                 'sv' => 'west_european',        // Swedish
 291                 'th' => 'thai',
 292                 'uk' => 'cyrillic',             // Ukrainian
 293                 'vi' => 'vietnamese',
 294                 'zh' => 'chinese',
 295                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
 296                         // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
 297                 'ara' => 'arabic',
 298                 'bgr' => 'cyrillic',            // Bulgarian
 299                 'cat' => 'west_european',       // Catalan
 300                 'chs' => 'simpl_chinese',
 301                 'cht' => 'trad_chinese',
 302                 'csy' => 'east_european',       // Czech
 303                 'dan' => 'west_european',       // Danish
 304                 'deu' => 'west_european',       // German
 305                 'dea' => 'west_european',       // German (Austrian)
 306                 'des' => 'west_european',       // German (Swiss)
 307                 'ena' => 'west_european',       // English (Australian)
 308                 'enc' => 'west_european',       // English (Canadian)
 309                 'eng' => 'west_european',       // English
 310                 'enz' => 'west_european',       // English (New Zealand)
 311                 'enu' => 'west_european',       // English (United States)
 312                 'euq' => 'west_european',       // Basque
 313                 'fos' => 'west_european',       // Faroese
 314                 'far' => 'arabic',      // Persian
 315                 'fin' => 'west_european',       // Finnish
 316                 'fra' => 'west_european',       // French
 317                 'frb' => 'west_european',       // French (Belgian)
 318                 'frc' => 'west_european',       // French (Canadian)
 319                 'frs' => 'west_european',       // French (Swiss)
 320                 'ell' => 'greek',
 321                 'heb' => 'hebrew',
 322                 'hin' => 'unicode',     // Hindi
 323                 'hun' => 'east_european',       // Hungarian
 324                 'isl' => 'west_european',       // Icelandic
 325                 'ita' => 'west_european',       // Italian
 326                 'its' => 'west_european',       // Italian (Swiss)
 327                 'jpn' => 'japanese',
 328                 'kor' => 'korean',
 329                 'lth' => 'lithuanian',
 330                 'lvi' => 'west_european',       // Latvian/Lettish
 331                 'msl' => 'west_european',       // Malay
 332                 'nlb' => 'west_european',       // Dutch (Belgian)
 333                 'nld' => 'west_european',       // Dutch
 334                 'nor' => 'west_european',       // Norwegian (bokmal)
 335                 'non' => 'west_european',       // Norwegian (nynorsk)
 336                 'plk' => 'east_european',       // Polish
 337                 'ptg' => 'west_european',       // Portuguese
 338                 'ptb' => 'west_european',       // Portuguese (Brazil)
 339                 'rom' => 'east_european',       // Romanian
 340                 'rus' => 'cyrillic',            // Russian
 341                 'slv' => 'east_european',       // Slovenian
 342                 'sky' => 'east_european',       // Slovak
 343                 'srl' => 'east_european',       // Serbian (Latin)
 344                 'srb' => 'cyrillic',            // Serbian (Cyrillic)
 345                 'esp' => 'west_european',       // Spanish (trad. sort)
 346                 'esm' => 'west_european',       // Spanish (Mexican)
 347                 'esn' => 'west_european',       // Spanish (internat. sort)
 348                 'sve' => 'west_european',       // Swedish
 349                 'tha' => 'thai',
 350                 'trk' => 'turkish',
 351                 'ukr' => 'cyrillic',    // Ukrainian
 352                         // English language names
 353                 'arabic' => 'arabic',
 354                 'basque' => 'west_european',
 355                 'bosnian' => 'east_european',
 356                 'bulgarian' => 'cyrillic',      // same script as the 'bg' and 'bgr' entries
 357                 'catalan' => 'west_european',
 358                 'croatian' => 'east_european',
 359                 'czech' => 'east_european',
 360                 'danish' => 'west_european',
 361                 'dutch' => 'west_european',
 362                 'english' => 'west_european',
 363                 'esperanto' => 'unicode',
 364                 'estonian' => 'estonian',
 365                 'faroese' => 'west_european',
 366                 'farsi' => 'arabic',
 367                 'finnish' => 'west_european',
 368                 'french' => 'west_european',
 369                 'galician' => 'west_european',
 370                 'german' => 'west_european',
 371                 'greek' => 'greek',
 372                 'greenlandic' => 'west_european',
 373                 'hebrew' => 'hebrew',
 374                 'hindi' => 'unicode',
 375                 'hungarian' => 'east_european',
 376                 'icelandic' => 'west_european',
 377                 'italian' => 'west_european',
 378                 'latvian' => 'west_european',
 379                 'lettish' => 'west_european',
 380                 'lithuanian' => 'lithuanian',
 381                 'malay' => 'west_european',
 382                 'norwegian' => 'west_european',
 383                 'persian' => 'arabic',
 384                 'polish' => 'east_european',
 385                 'portuguese' => 'west_european',
 386                 'russian' => 'cyrillic',
 387                 'romanian' => 'east_european',
 388                 'serbian' => 'cyrillic',
 389                 'slovak' => 'east_european',
 390                 'slovenian' => 'east_european',
 391                 'spanish' => 'west_european',
 392                 'swedish' => 'west_european',
 393                 'thai' => 'thai',
 394                 'turkish' => 'turkish',
 395                 'ukrainian' => 'cyrillic',
 396         );
 397
 398                 // mapping of language (family) names to charsets on Unix
 399         var $script_to_charset_unix=array(
 400                 'west_european' => 'iso-8859-1',
 401                 'estonian' => 'iso-8859-1',
 402                 'east_european' => 'iso-8859-2',
 403                 'baltic' => 'iso-8859-4',
 404                 'cyrillic' => 'iso-8859-5',
 405                 'arabic' => 'iso-8859-6',
 406                 'greek' => 'iso-8859-7',
 407                 'hebrew' => 'iso-8859-8',
 408                 'turkish' => 'iso-8859-9',
 409                 'thai' => 'iso-8859-11', // = TIS-620
 410                 'lithuanian' => 'iso-8859-13',
 411                 'chinese' => 'gb2312', // = euc-cn
 412                 'japanese' => 'euc-jp',
 413                 'korean' => 'euc-kr',
 414                 'simpl_chinese' => 'gb2312',
 415                 'trad_chinese' => 'big5',
 416                 'vietnamese' => '',
 417                 'unicode' => 'utf-8',
 418         );
 419
 420                 // mapping of language (family) names to charsets on Windows
 421         var $script_to_charset_windows=array(
 422                 'east_european' => 'windows-1250',
 423                 'cyrillic' => 'windows-1251',
 424                 'west_european' => 'windows-1252',
 425                 'greek' => 'windows-1253',
 426                 'turkish' => 'windows-1254',
 427                 'hebrew' => 'windows-1255',
 428                 'arabic' => 'windows-1256',
 429                 'baltic' => 'windows-1257',
 430                 'estonian' => 'windows-1257',
 431                 'lithuanian' => 'windows-1257',
 432                 'vietnamese' => 'windows-1258',
 433                 'thai' => 'cp874',
 434                 'korean' => 'cp949',
 435                 'chinese' => 'gb2312',
 436                 'japanese' => 'shift_jis',
 437                 'simpl_chinese' => 'gb2312',
 438                 'trad_chinese' => 'big5',
 439         );
 440
 441                 // mapping of locale names to charsets
 442         var $locale_to_charset=array(   // keys must be all lowercase: get_locale_charset() lowercases the locale before looking it up here
 443                 'japanese.euc' => 'euc-jp',
 444                 'ja_jp.ujis' => 'euc-jp',
 445                 'korean.euc' => 'euc-kr',
 446                 'sr@latn' => 'iso-8859-2',      // Serbian in Latin script
 447                 'zh_cn' => 'gb2312',
 448                 'zh_hk' => 'big5',
 449                 'zh_tw' => 'big5',
 450         );
 451
 452                 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
 453                 // An empty value means "iso-8859-1"
 454         var $charSetArray = array(
 455                 'dk' => '',
 456                 'de' => '',
 457                 'no' => '',
 458                 'it' => '',
 459                 'fr' => '',
 460                 'es' => '',
 461                 'nl' => '',
 462                 'cz' => 'windows-1250',
 463                 'pl' => 'iso-8859-2',
 464                 'si' => 'windows-1250',
 465                 'fi' => '',
 466                 'tr' => 'iso-8859-9',
 467                 'se' => '',
 468                 'pt' => '',
 469                 'ru' => 'windows-1251',
 470                 'ro' => 'iso-8859-2',
 471                 'ch' => 'gb2312',
 472                 'sk' => 'windows-1250',
 473                 'lt' => 'windows-1257',
 474                 'is' => 'utf-8',
 475                 'hr' => 'windows-1250',
 476                 'hu' => 'iso-8859-2',
 477                 'gl' => '',
 478                 'th' => 'iso-8859-11',
 479                 'gr' => 'iso-8859-7',
 480                 'hk' => 'big5',
 481                 'eu' => '',
 482                 'bg' => 'windows-1251',
 483                 'br' => '',
 484                 'et' => 'iso-8859-4',
 485                 'ar' => 'iso-8859-6',
 486                 'he' => 'utf-8',
 487                 'ua' => 'windows-1251',
 488                 'jp' => 'shift_jis',
 489                 'lv' => 'utf-8',
 490                 'vn' => 'utf-8',
 491                 'ca' => 'iso-8859-15',
 492                 'ba' => 'iso-8859-2',
 493                 'kr' => 'euc-kr',
 494                 'eo' => 'utf-8',
 495                 'my' => '',
 496                 'hi' => 'utf-8',
 497                 'fo' => 'utf-8',
 498                 'fa' => 'utf-8',
 499                 'sr' => 'utf-8'
 500         );
 501
 502                 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
 503                 // A missing key means: the ISO name is the same as the TYPO3 key
 504         var $isoArray = array(
 505                 'ba' => 'bs',
 506                 'br' => 'pt_BR',
 507                 'ch' => 'zh_CN',
 508                 'cz' => 'cs',
 509                 'dk' => 'da',
 510                 'si' => 'sl',
 511                 'se' => 'sv',
 512                 'gl' => 'kl',
 513                 'gr' => 'el',
 514                 'hk' => 'zh_HK',
 515                 'kr' => 'ko',
 516                 'ua' => 'uk',
 517                 'jp' => 'ja',
 518                 'vn' => 'vi',
 519         );
 520
 521         /**
 522          * Normalize a charset name: lowercase it and replace a known synonym by its canonical name.
 523          *
 524          * @param       string          Input charset
 525          * @return      string          Normalized charset
 526          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 527          */
 528         function parse_charset($charset)        {
 529                 $normalized = strtolower($charset);     // Charset names are compared case-insensitively.
 530
 531                         // A known alias yields its canonical name; every other name is returned lowercased.
 532                 return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
 533         }
 534
 535         /**
 536          * Get the charset of a locale.
 537          *
 538          * ln            language
 539          * ln_CN         language / country
 540          * ln_CN.cs      language / country / charset
 541          * ln_CN.cs@mod  language / country / charset / modifier
 542          *
 543          * @param       string          Locale string
 544          * @return      string          Charset resolved for locale string
 545          * @author      Martin Kutschker <martin.t.kutschker@blackbox.net>
 546          */
 547         function get_locale_charset($locale)    {
 548                 $locale = strtolower($locale);
 549
 550                         // An exact entry in the locale table wins over all parsing below.
 551                 if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
 552
 553                         // Split off the '@modifier'; array_pad() keeps list() from reading a missing element when there is none.
 554                 list($locale,$modifier) = array_pad(explode('@',$locale),2,'');
 555
 556                         // An explicit '.charset' part is used directly (after synonym resolution).
 557                 list($locale,$charset) = array_pad(explode('.',$locale),2,'');
 558                 if ($charset)   return $this->parse_charset($charset);
 559
 560                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
 561                 if ($modifier == 'euro')        return 'iso-8859-15';
 562
 563                         // The language is the part before '_'; an unknown language maps to no script and gets the platform default.
 564                 list($language) = explode('_',$locale);
 565                 $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
 566
 567                 if (defined('TYPO3_OS') && TYPO3_OS == 'WIN')   {       // An undefined TYPO3_OS is treated as non-Windows.
 568                         $cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
 569                 } else {
 570                         $cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
 571                 }
 572
 573                 return $cs;
 574         }
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584         /********************************************
 585          *
 586          * Charset Conversion functions
 587          *
 588          ********************************************/
 589
 590         /**
 591          * Convert from one charset to another charset.
 592          *
 593          * @param       string          Input string
 594          * @param       string          From charset (the current charset of the string)
 595          * @param       string          To charset (the output charset wanted)
 596          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 597          * @return      string          Converted string
 598          * @see convArray()
 599          */
 600         function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
 601                 if ($fromCS==$toCS)     return $str;    // Same charset on both sides: nothing to do.
 602                 $convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';      // Not configured: no case below matches, so the built-in conversion is used.
 603                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
 604                 if ($toCS=='utf-8' || !$useEntityForNoChar)     {
 605                         switch($convMethod)     {
 606                         case 'mbstring':        // In every case a missing extension counts as a failed conversion instead of a fatal error.
 607                                 $conv_str = function_exists('mb_convert_encoding') ? mb_convert_encoding($str,$toCS,$fromCS) : false;
 608                                 if (false !== $conv_str)        return $conv_str; // false: extension missing or charset not supported
 609                                 break;
 610
 611                         case 'iconv':
 612                                 $conv_str = function_exists('iconv') ? iconv($fromCS,$toCS.'//IGNORE',$str) : false;
 613                                 if (false !== $conv_str)        return $conv_str;
 614                                 break;
 615
 616                         case 'recode':
 617                                 $conv_str = function_exists('recode_string') ? recode_string($fromCS.'..'.$toCS,$str) : false;
 618                                 if (false !== $conv_str)        return $conv_str;
 619                                 break;
 620                         }
 621                         // No extension delivered a result: fall back to the conversion implemented by this class.
 622                 }
 623                         // Built-in conversion goes through UTF-8: source charset -> UTF-8 -> target charset.
 624                 if ($fromCS!='utf-8')   $str=$this->utf8_encode($str,$fromCS);
 625                 if ($toCS!='utf-8')     $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
 626                 return $str;
 627         }
 628
 629         /**
 630          * Convert all elements in ARRAY from one charset to another charset.
 631          * NOTICE: Array is passed by reference!
 632          *
 633          * @param       array           Input array, possibly multidimensional (modified in place)
 634          * @param       string          From charset (the current charset of the string)
 635          * @param       string          To charset (the output charset wanted)
 636          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 637          * @return      void
 638          * @see conv()
 639          */
 640         function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
 641                 foreach($array as $idx => $item)        {
 642                         if (!is_array($array[$idx]))    {       // Leaf value: convert it and store the result back in place.
 643                                 $array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
 644                                 continue;
 645                         }
 646                         $this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);       // Nested array: recurse; it is passed by reference.
 647                 }
 648         }
 649
 650         /**
 651          * Converts $str from $charset to UTF-8
 652          *
 653          * @param       string          String in local charset to convert to UTF-8
 654          * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
 655          * @return      string          Output string, converted to UTF-8
 656          */
 657         function utf8_encode($str,$charset)     {
 658
 659                 if ($charset === 'utf-8')       return $str;
 660
 661                         // Charset is case-insensitive.
 662                 if ($this->initCharset($charset))       {       // Parse conv. table if not already...
 663                         $strLen = strlen($str);
 664                         $outStr='';
 665
 666                         for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
 667                                 $chr=substr($str,$a,1);
 668                                 $ord=ord($chr);
 669                                 if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
 670                                         $ord2 = ord($str{$a+1});
 671                                         $ord = $ord<<8 | $ord2; // assume big endian
 672
 673                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 674                                                 $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
 675                                         } else $outStr.=chr($this->noCharByteVal);      // No char exists
 676                                         $a++;
 677                                 } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
 678                                         if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
 679                                                 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {       // Shift-JIS: chars between 160 and 223 are single byte
 680                                                         $a++;
 681                                                         $ord2=ord(substr($str,$a,1));
 682                                                         $ord = $ord*256+$ord2;
 683                                                 }
 684                                         }
 685
 686                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 687                                                 $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
 688                                         } else $outStr.= chr($this->noCharByteVal);     // No char exists
 689                                 } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 690                         }
 691                         return $outStr;
 692                 }
 693         }
 694
 695         /**
 696          * Converts $str from UTF-8 to $charset
 697          *
 698          * @param       string          String in UTF-8 to convert to local charset
 699          * @param       string          Charset, lowercase. Must be found in csconvtbl/ folder.
 700          * @param       boolean         If set, then characters that are not available in the destination character set will be encoded as numeric entities
 701          * @return      string          Output string, converted to local charset
 702          */
 703         function utf8_decode($str,$charset,$useEntityForNoChar=0)       {
 704
 705                         // Charset is case-insensitive.
 706                 if ($this->initCharset($charset))       {       // Parse conv. table if not already...
 707                         $strLen = strlen($str);
 708                         $outStr='';
 709                         $buf='';
 710                         for ($a=0,$i=0;$a<$strLen;$a++,$i++)    {       // Traverse each char in UTF-8 string.
 711                                 $chr=substr($str,$a,1);
 712                                 $ord=ord($chr);
 713                                 if ($ord>127)   {       // This means multibyte! (first byte!)
 714                                         if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 715
 716                                                 $buf=$chr;      // Add first byte
 717                                                 for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
 718                                                         $ord = $ord << 1;       // Shift it left and ...
 719                                                         if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 720                                                                 $a++;   // Increase pointer...
 721                                                                 $buf.=substr($str,$a,1);        // ... and add the next char.
 722                                                         } else break;
 723                                                 }
 724
 725                                                 if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))       {       // If the UTF-8 char-sequence is found then...
 726                                                         $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
 727                                                         if ($mByte>255) {       // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
 728                                                                 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
 729                                                         } else $outStr.= chr($mByte);
 730                                                 } elseif ($useEntityForNoChar) {        // Create num entity:
 731                                                         $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
 732                                                 } else $outStr.=chr($this->noCharByteVal);      // No char exists
 733                                         } else $outStr.=chr($this->noCharByteVal);      // No char exists (MIDDLE of MB sequence!)
 734                                 } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 735                         }
 736                         return $outStr;
 737                 }
 738         }
 739
 740         /**
 741          * Converts all chars > 127 to numeric entities.
 742          *
 743          * @param       string          Input string
 744          * @return      string          Output string
 745          */
 746         function utf8_to_entities($str) {
 747                 $strLen = strlen($str);
 748                 $outStr='';
 749                 $buf='';
 750                 for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in UTF-8 string.
 751                         $chr=substr($str,$a,1);
 752                         $ord=ord($chr);
 753                         if ($ord>127)   {       // This means multibyte! (first byte!)
 754                                 if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 755                                         $buf=$chr;      // Add first byte
 756                                         for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
 757                                                 $ord = $ord << 1;       // Shift it left and ...
 758                                                 if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 759                                                         $a++;   // Increase pointer...
 760                                                         $buf.=substr($str,$a,1);        // ... and add the next char.
 761                                                 } else break;
 762                                         }
 763
 764                                         $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
 765                                 } else $outStr.=chr($this->noCharByteVal);      // No char exists (MIDDLE of MB sequence!)
 766                         } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 767                 }
 768
 769                 return $outStr;
 770         }
 771
 772         /**
 773          * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 774          *
 775          * @param       string          Input string, UTF-8
 776          * @param       boolean         If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 777          * @return      string          Output string
 778          */
 779         function entities_to_utf8($str,$alsoStdHtmlEnt=0)       {
 780                 if ($alsoStdHtmlEnt)    {
 781                         $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));             // Getting them in iso-8859-1 - but thats ok since this is observed below.
 782                 }
 783
 784                 $token = md5(microtime());
 785                 $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
 786                 foreach($parts as $k => $v)     {
 787                         if ($k%2)       {
 788                                 if (substr($v,0,1)=='#')        {       // Dec or hex entities:
 789                                         if (substr($v,1,1)=='x')        {
 790                                                 $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
 791                                         } else {
 792                                                 $parts[$k] = $this->UnumberToChar(substr($v,1));
 793                                         }
 794                                 } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) {  // Other entities:
 795                                         $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
 796                                 } else {        // No conversion:
 797                                         $parts[$k] ='&'.$v.';';
 798                                 }
 799                         }
 800                 }
 801
 802                 return implode('',$parts);
 803         }
 804
 805         /**
 806          * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 807          *
 808          * @param       string          Input string, UTF-8
 809          * @param       boolean         If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 810          * @param       boolean         If set, then instead of integer numbers the real UTF-8 char is returned.
 811          * @return      array           Output array with the char numbers
 812          */
 813         function utf8_to_numberarray($str,$convEntities=0,$retChar=0)   {
 814                         // If entities must be registered as well...:
 815                 if ($convEntities)      {
 816                         $str = $this->entities_to_utf8($str,1);
 817                 }
 818                         // Do conversion:
 819                 $strLen = strlen($str);
 820                 $outArr=array();
 821                 $buf='';
 822                 for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in UTF-8 string.
 823                         $chr=substr($str,$a,1);
 824                         $ord=ord($chr);
 825                         if ($ord>127)   {       // This means multibyte! (first byte!)
 826                                 if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 827                                         $buf=$chr;      // Add first byte
 828                                         for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
 829                                                 $ord = $ord << 1;       // Shift it left and ...
 830                                                 if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 831                                                         $a++;   // Increase pointer...
 832                                                         $buf.=substr($str,$a,1);        // ... and add the next char.
 833                                                 } else break;
 834                                         }
 835
 836                                         $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
 837                                 } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;       // No char exists (MIDDLE of MB sequence!)
 838                         } else $outArr[]=$retChar?chr($ord):$ord;       // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 839                 }
 840
 841                 return $outArr;
 842         }
 843
 844         /**
 845          * Converts a UNICODE number to a UTF-8 multibyte character
 846          * Algorithm based on script found at From: http://czyborra.com/utf/
 847          * Unit-tested by Kasper
 848          *
 849          * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 850          *
 851          *  bytes | bits | representation
 852          *      1 |    7 | 0vvvvvvv
 853          *      2 |   11 | 110vvvvv 10vvvvvv
 854          *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 855          *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 856          *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 857          *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 858          *
 859          * @param       integer         UNICODE integer
 860          * @return      string          UTF-8 multibyte character string
 861          * @see utf8CharToUnumber()
 862          */
 863         function UnumberToChar($cbyte)  {
 864                 $str='';
 865
 866                 if ($cbyte < 0x80) {
 867                         $str.=chr($cbyte);
 868                 } else if ($cbyte < 0x800) {
 869                         $str.=chr(0xC0 | ($cbyte >> 6));
 870                         $str.=chr(0x80 | ($cbyte & 0x3F));
 871                 } else if ($cbyte < 0x10000) {
 872                         $str.=chr(0xE0 | ($cbyte >> 12));
 873                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 874                         $str.=chr(0x80 | ($cbyte & 0x3F));
 875                 } else if ($cbyte < 0x200000) {
 876                         $str.=chr(0xF0 | ($cbyte >> 18));
 877                         $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 878                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 879                         $str.=chr(0x80 | ($cbyte & 0x3F));
 880                 } else if ($cbyte < 0x4000000) {
 881                         $str.=chr(0xF8 | ($cbyte >> 24));
 882                         $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
 883                         $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 884                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 885                         $str.=chr(0x80 | ($cbyte & 0x3F));
 886                 } else if ($cbyte < 0x80000000) {
 887                         $str.=chr(0xFC | ($cbyte >> 30));
 888                         $str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
 889                         $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
 890                         $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
 891                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
 892                         $str.=chr(0x80 | ($cbyte & 0x3F));
 893                 } else { // Cannot express a 32-bit character in UTF-8
 894                         $str .= chr($this->noCharByteVal);
 895                 }
 896                 return $str;
 897         }
 898
 899         /**
 900          * Converts a UTF-8 Multibyte character to a UNICODE number
 901          * Unit-tested by Kasper
 902          *
 903          * @param       string          UTF-8 multibyte character string
 904          * @param       boolean         If set, then a hex. number is returned.
 905          * @return      integer         UNICODE integer
 906          * @see UnumberToChar()
 907          */
 908         function utf8CharToUnumber($str,$hex=0) {
 909                 $ord=ord(substr($str,0,1));     // First char
 910
 911                 if (($ord & 192) == 192)        {       // This verifyes that it IS a multi byte string
 912                         $binBuf='';
 913                         for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
 914                                 $ord = $ord << 1;       // Shift it left and ...
 915                                 if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 916                                         $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
 917                                 } else break;
 918                         }
 919                         $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;
 920
 921                         $int = bindec($binBuf);
 922                 } else $int = $ord;
 923
 924                 return $hex ? 'x'.dechex($int) : $int;
 925         }
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935         /********************************************
 936          *
 937          * Init functions
 938          *
 939          ********************************************/
 940
 941         /**
 942          * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 943          * This function is automatically called by the conversion functions
 944          *
 945          * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 946          *
 947          * @param       string          The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 948          * @return      integer         Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
 949          * @access private
 950          */
 951         function initCharset($charset)  {
 952                         // Only process if the charset is not yet loaded:
 953                 if (!is_array($this->parsedCharsets[$charset])) {
 954
 955                                 // Conversion table filename:
 956                         $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
 957
 958                                 // If the conversion table is found:
 959                         if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))      {
 960                                         // Cache file for charsets:
 961                                         // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
 962                                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
 963                                 if ($cacheFile && @is_file($cacheFile)) {
 964                                         $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
 965                                 } else {
 966                                                 // Parse conversion table into lines:
 967                                         $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
 968                                                 // Initialize the internal variable holding the conv. table:
 969                                         $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
 970                                                 // traverse the lines:
 971                                         $detectedType='';
 972                                         foreach($lines as $value)       {
 973                                                 if (trim($value) && substr($value,0,1)!='#')    {       // Comment line or blanks are ignored.
 974
 975                                                                 // Detect type if not done yet: (Done on first real line)
 976                                                                 // The "whitespaced" type is on the syntax      "0x0A   0x000A  #LINE FEED"     while   "ms-token" is like              "B9 = U+00B9 : SUPERSCRIPT ONE"
 977                                                         if (!$detectedType)             $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
 978
 979                                                         if ($detectedType=='ms-token')  {
 980                                                                 list($hexbyte,$utf8) = split('=|:',$value,3);
 981                                                         } elseif ($detectedType=='whitespaced') {
 982                                                                 $regA=array();
 983                                                                 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
 984                                                                 $hexbyte = $regA[1];
 985                                                                 $utf8 = 'U+'.$regA[2];
 986                                                         }
 987                                                         $decval = hexdec(trim($hexbyte));
 988                                                         if ($decval>127)        {
 989                                                                 $utf8decval = hexdec(substr(trim($utf8),2));
 990                                                                 $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
 991                                                                 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;
 992                                                         }
 993                                                 }
 994                                         }
 995                                         if ($cacheFile) {
 996                                                 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
 997                                         }
 998                                 }
 999                                 return 2;
1000                         } else return false;
1001                 } else return 1;
1002         }
1003
1004         /**
1005          * This function initializes all UTF-8 character data tables.
1006          *
1007          * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1008          *
1009          * @param       string          Mode ("case", "ascii", ...)
1010          * @return      integer         Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1011          * @access private
1012          */
1013         function initUnicodeData($mode=null)    {
1014                         // cache files
1015                 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1016                 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1017
1018                         // Only process if the tables are not yet loaded
1019                 switch($mode)   {
1020                         case 'case':
1021                                 if (is_array($this->caseFolding['utf-8']))      return 1;
1022
1023                                         // Use cached version if possible
1024                                 if ($cacheFileCase && @is_file($cacheFileCase)) {
1025                                         $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1026                                         return 2;
1027                                 }
1028                                 break;
1029
1030                         case 'ascii':
1031                                 if (is_array($this->toASCII['utf-8']))  return 1;
1032
1033                                         // Use cached version if possible
1034                                 if ($cacheFileASCII && @is_file($cacheFileASCII))       {
1035                                         $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1036                                         return 2;
1037                                 }
1038                                 break;
1039                 }
1040
1041                         // process main Unicode data file
1042                 $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
1043                 if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
1044
1045                 $fh = fopen($unicodeDataFile,'rb');
1046                 if (!$fh)       return false;
1047
1048                         // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1049                         // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1050                 $this->caseFolding['utf-8'] = array();
1051                 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1052                 $utf8CaseFolding['toUpper'] = array();
1053                 $utf8CaseFolding['toLower'] = array();
1054                 $utf8CaseFolding['toTitle'] = array();
1055
1056                 $decomposition = array();       // array of temp. decompositions
1057                 $mark = array();                // array of chars that are marks (eg. composing accents)
1058                 $number = array();              // array of chars that are numbers (eg. digits)
1059                 $omit = array();                // array of chars to be omitted (eg. Russian hard sign)
1060
1061                 while (!feof($fh))      {
1062                         $line = fgets($fh,4096);
1063                                 // has a lot of info
1064                         list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
1065
1066                         $ord = hexdec($char);
1067                         if ($ord > 0xFFFF)      break;  // only process the BMP
1068
1069                         $utf8_char = $this->UnumberToChar($ord);
1070
1071                         if ($upper)     $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1072                         if ($lower)     $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1073                                 // store "title" only when different from "upper" (only a few)
1074                         if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1075
1076                         switch ($cat{0})        {
1077                                 case 'M':       // mark (accent, umlaut, ...)
1078                                         $mark["U+$char"] = 1;
1079                                         break;
1080
1081                                 case 'N':       // numeric value
1082                                         if ($ord > 0x80 && $num != '')  $number["U+$char"] = $num;
1083                         }
1084
1085                                 // accented Latin letters without "official" decomposition
1086                         $match = array();
1087                         if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp)        {
1088                                 $c = ord($match[2]);
1089                                 if ($match[1] == 'SMALL')       $c += 32;
1090
1091                                 $decomposition["U+$char"] = array(dechex($c));
1092                                 continue;
1093                         }
1094
1095                         $match = array();
1096                         if (ereg('(<.*>)? *(.+)',$decomp,$match))       {
1097                                 switch($match[1])       {
1098                                         case '<circle>':        // add parenthesis as circle replacement, eg (1)
1099                                                 $match[2] = '0028 '.$match[2].' 0029';
1100                                                 break;
1101
1102                                         case '<square>':        // add square brackets as square replacement, eg [1]
1103                                                 $match[2] = '005B '.$match[2].' 005D';
1104                                                 break;
1105
1106                                         case '<compat>':        // ignore multi char decompositions that start with a space
1107                                                 if (ereg('^0020 ',$match[2]))   continue 2;
1108                                                 break;
1109
1110                                                 // ignore Arabic and vertical layout presentation decomposition
1111                                         case '<initial>':
1112                                         case '<medial>':
1113                                         case '<final>':
1114                                         case '<isolated>':
1115                                         case '<vertical>':
1116                                                 continue 2;
1117                                 }
1118                                 $decomposition["U+$char"] = split(' ',$match[2]);
1119                         }
1120                 }
1121                 fclose($fh);
1122
1123                         // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1124                 $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
1125                 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
1126                         $fh = fopen($specialCasingFile,'rb');
1127                         if ($fh)        {
1128                                 while (!feof($fh))      {
1129                                         $line = fgets($fh,4096);
1130                                         if ($line{0} != '#' && trim($line) != '')       {
1131
1132                                                 list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
1133                                                 if ($cond == '' || $cond{0} == '#')     {
1134                                                         $utf8_char = $this->UnumberToChar(hexdec($char));
1135                                                         if ($char != $lower)    {
1136                                                                 $arr = split(' ',$lower);
1137                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1138                                                                 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
1139                                                         }
1140                                                         if ($char != $title && $title != $upper)        {
1141                                                                 $arr = split(' ',$title);
1142                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1143                                                                 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
1144                                                         }
1145                                                         if ($char != $upper)    {
1146                                                                         $arr = split(' ',$upper);
1147                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1148                                                                 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
1149                                                         }
1150                                                 }
1151                                         }
1152                                 }
1153                                 fclose($fh);
1154                         }
1155                 }
1156
1157                         // process custom decompositions
1158                 $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
1159                 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))      {
1160                         $fh = fopen($customTranslitFile,'rb');
1161                         if ($fh)        {
1162                                 while (!feof($fh))      {
1163                                         $line = fgets($fh,4096);
1164                                         if ($line{0} != '#' && trim($line) != '')       {
1165                                                 list($char,$translit) = t3lib_div::trimExplode(';', $line);
1166                                                 if (!$translit) $omit["U+$char"] = 1;
1167                                                 $decomposition["U+$char"] = split(' ', $translit);
1168
1169                                         }
1170                                 }
1171                                 fclose($fh);
1172                         }
1173                 }
1174
1175                         // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1176                 foreach($decomposition as $from => $to) {
1177                         $code_decomp = array();
1178
1179                         while ($code_value = array_shift($to))  {
1180                                 if (isset($decomposition["U+$code_value"]))     {       // do recursive decomposition
1181                                         foreach(array_reverse($decomposition["U+$code_value"]) as $cv)  {
1182                                                 array_unshift($to, $cv);
1183                                         }
1184                                 } elseif (!isset($mark["U+$code_value"])) {     // remove mark
1185                                         array_push($code_decomp, $code_value);
1186                                 }
1187                         }
1188                         if (count($code_decomp) || isset($omit[$from])) {
1189                                 $decomposition[$from] = $code_decomp;
1190                         } else {
1191                                 unset($decomposition[$from]);
1192                         }
1193                 }
1194
1195                         // create ascii only mapping
1196                 $this->toASCII['utf-8'] = array();
1197                 $ascii =& $this->toASCII['utf-8'];
1198
1199                 foreach($decomposition as $from => $to) {
1200                         $code_decomp = array();
1201                         while ($code_value = array_shift($to))  {
1202                                 $ord = hexdec($code_value);
1203                                 if ($ord > 127)
1204                                         continue 2;     // skip decompositions containing non-ASCII chars
1205                                 else
1206                                         array_push($code_decomp,chr($ord));
1207                         }
1208                         $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
1209                 }
1210
1211                         // add numeric decompositions
1212                 foreach($number as $from => $to)        {
1213                         $utf8_char = $this->UnumberToChar(hexdec($from));
1214                         if (!isset($ascii[$utf8_char])) {
1215                                 $ascii[$utf8_char] = $to;
1216                         }
1217                 }
1218
1219                 if ($cacheFileCase)     {
1220                                 t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
1221                 }
1222
1223                 if ($cacheFileASCII)    {
1224                                 t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
1225                 }
1226
1227                 return 3;
1228         }
1229
/**
 * Initializes the case folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is taken from memory if it is already loaded, otherwise from the
 * cache file typo3temp/cs/cscase_[charset].tbl. If neither is usable it is
 * derived from the UTF-8 case folding data for every character of the
 * charset's parsed conversion table, completed with the plain ASCII
 * a-z / A-Z mappings and written to the cache file.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded (isset() keeps the first lookup free of notices):
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// unserialize() yields FALSE for a damaged file; only a real table counts as a cache hit,
			// anything else falls through so the table (and the cache file) gets rebuilt
		if (is_array($cached))	{
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case'))	{
		return false;
	}

	$this->caseFolding[$charset] = array();
	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction)	{
				// characters without a folding entry are simply skipped
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8]))	continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
				// keep the mapping only if the folded form can be represented in the target charset
			if ($cc != '' && $cc != $nochar)	$this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

		// add the ASCII case table ('a'-'z' and 'A'-'Z' are 32 code points apart)
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
1291
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * The table is taken from memory if it is already loaded, otherwise from the
 * cache file typo3temp/cs/csascii_[charset].tbl. If neither is usable it is
 * built by copying the UTF-8 transliteration of every character of the
 * charset's parsed conversion table and written to the cache file.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2 cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset)	{
		// Only process if the table is not yet loaded (isset() keeps the first lookup free of notices):
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
			// unserialize() yields FALSE for a damaged file; only a real table counts as a cache hit,
			// anything else falls through so the table (and the cache file) gets rebuilt
		if (is_array($cached))	{
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// start from an empty table so that a charset without any transliterable
		// character still ends up with an array (in memory and in the cache file)
	$this->toASCII[$charset] = array();

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
		if (isset($this->toASCII['utf-8'][$utf8]))	{
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353         /********************************************
1354          *
1355          * String operation functions
1356          *
1357          ********************************************/
1358
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is handed to
 * mbstring or iconv; otherwise the internal UTF-8 / EUC routines are used.
 * Charsets listed in $this->twoByteSets / $this->fourByteSets are handled as
 * fixed width, everything else as single-byte.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL (default) means "up to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring')	{
			// cannot omit $len, when specifying charset
		if ($len==null)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($utils == 'iconv')	{
			// cannot omit $len, when specifying charset
		if ($len==null)	{
			$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
			iconv_set_encoding('internal_encoding',$charset);
			$str = iconv_substr($string,$start);
			iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

			return $str;
		}
		return iconv_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// fixed-width charsets: translate character positions into byte positions
	$width = 0;
	if (!empty($this->twoByteSets[$charset]))	{
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$width = 4;
	}
	if ($width)	{
		$start *= $width;
			// an omitted length has to stay omitted: NULL*$width would be 0 and cut everything away
		if ($len !== null)	$len *= $width;
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
1413
/**
 * Returns the length of a string in characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * mbstring or iconv are used when selected by
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']; otherwise UTF-8 and EUC based
 * charsets go through the internal counters, charsets listed in
 * $this->twoByteSets / $this->fourByteSets are divided by their byte width
 * and anything else is counted byte by byte.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring')	return mb_strlen($string,$charset);
	if ($utils == 'iconv')	return iconv_strlen($string,$charset);

	if ($charset == 'utf-8')	return $this->utf8_strlen($string);
	if ($this->eucBasedSets[$charset])	return $this->euc_strlen($string,$charset);

		// fixed-width charsets: byte length divided by the width of one character
	if ($this->twoByteSets[$charset])	return strlen($string)/2;
	if ($this->fourByteSets[$charset])	return strlen($string)/4;

		// everything else counts as one byte per character
	return strlen($string);
}
1441
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned untouched when $len is 0 or when the string is not
 * longer than the requested length. UTF-8 and EUC based charsets are
 * measured in characters, every other charset is treated as single-byte.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters)
 * @param	string		Crop signifier
 * @return	string		The shortened string
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='')	{
	if (intval($len) == 0)	return $string;

		// translate the character length into a byte position
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0)	{
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0)	$i = false;
	}

	if ($i === false)	return $string;	// $len outside actual string length

	$byteLength = strlen($string);
	if ($len > 0)	{
			// crop only if there is at least one byte behind the cut position
		if ($i >= 0 && $i < $byteLength)	{
			return substr($string,0,$i).$crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength)	{
			// crop only if there is at least one byte in front of the cut position
		return $crop.substr($string,$i);
	}

	return $string;
}
1496
/**
 * Cuts a string short at a given byte length.
 *
 * The cut never exceeds $len bytes. For multi-byte charsets the length is
 * reduced where needed so that no character is split: mbstring or the internal
 * UTF-8 / EUC routines take care of variable-width charsets, fixed-width
 * charsets ($this->twoByteSets / $this->fourByteSets) are realigned to a
 * multiple of their character width.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string (empty if $len is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
			// The byte length has to be handed over as well, otherwise nothing can be truncated.
			// NOTE(review): euc_strtrunc() is defined outside this part of the file; the argument order
			// ($str, $len, $charset) is assumed by analogy with euc_char2byte_pos() - confirm.
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		$len -= $len % 2;	// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
1525
/**
 * Converts every character of a string to the requested case.
 * In contrast to strtolower() and strtoupper() the result does not depend on the locale.
 * The string length may change, e.g. the lower case German sharp S (U+00DF) becomes upper case "SS".
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * With mbstring enabled every keyword other than "toLower" results in upper
 * case; in the other branches the keyword is handed on unchanged to the
 * charset specific mapping function.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)	{
	$useMbstring = ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3);

	if ($useMbstring)	{
		return ($case == 'toLower') ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// whatever is left is handled as a single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
1559
/**
 * Transliterates special characters (accented letters, umlauts, ligatures etc.) to ASCII equivalents.
 * A single character may turn into several, e.g. the "ae" ligature becomes the two letters "ae".
 *
 * UTF-8 and EUC based charsets use their own mapping routine, every other
 * charset is handled as a single-byte encoding.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// whatever is left is handled as a single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591         /********************************************
1592          *
1593          * Internal string operation functions
1594          *
1595          ********************************************/
1596
/**
 * Maps all characters of a string in a single byte charset.
 *
 * Every byte of the input is looked up in the case folding or to-ASCII table
 * of the charset; bytes without an entry are copied unchanged. The input is
 * returned as is when the mode is unknown or the table cannot be initialized.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

		// walk the string with an explicit length bound instead of probing for a
		// non-existing offset, and use [] offsets ({} offsets are gone in PHP 8)
	$str = (string)$str;
	$length = strlen($str);
	$out = '';
	for ($i=0; $i<$length; $i++)	{
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645         /********************************************
1646          *
1647          * Internal UTF-8 string operation functions
1648          *
1649          ********************************************/
1650
/**
 * Extracts a part of a UTF-8 string, addressed in characters.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * Both positions are translated to byte positions with utf8_char2byte_pos()
 * and the actual cutting is done with the byte based substr().
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
		// a length of zero (integer or string) always gives an empty result
	if (strcmp($len,'0') == 0)	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
			// a positive $start beyond the end is an error ...
		if ($start > 0)	return false;
			// ... any other unresolvable $start means: begin at the first byte
		$byte_start = 0;
	}

	$tail = substr($str,$byte_start);

		// no usable length given: everything from the start position on
	if ($len == null)	return $tail;

	$byte_end = $this->utf8_char2byte_pos($tail,$len);
	if ($byte_end !== false)	{
		return substr($tail,0,$byte_end);
	}

		// $len outside actual string length: a negative length that exceeds the string gives a blank string
	return $len<0 ? '' : $tail;
}
1685
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted, i.e.
 * single-byte characters (0xxxxxxx) and the starting byte of a multi-byte
 * sequence (11xxxxxx).
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$str = (string)$str;
	$byteLength = strlen($str);	// explicit bound: no read beyond the end of the string
	$n = 0;

	for ($i=0; $i<$byteLength; $i++)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}

	return $n;
}
1706
/**
 * Truncates a string in UTF-8 short at a given byte length.
 *
 * The result is never longer than $len bytes. If the cut would fall inside a
 * multi-byte character, that character is dropped completely; a character
 * that ends exactly at $len is kept.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string (empty if $len is zero or negative)
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	$str = (string)$str;
	$len = (int)$len;

	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80)	{	// last kept byte is part of a multibyte sequence
			// walk back over continuation bytes (10xxxxxx) to the byte starting the sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0x80)	return '';	// sanity check: no starting byte at all

		if (($lead & 0xC0) == 0xC0)	{
				// the number of leading 1-bits of the starting byte is the length of the sequence
			for ($bc=0; $lead & 0x80; $lead = $lead << 1)	$bc++;
				// sequence does not fit completely: cut right in front of it
			if ($i+$bc > $len)	return substr($str,0,$i);
		}
			// fallthru: multibyte char fits into length
	}

	return substr($str,0,$len);
}
1727
/**
 * Finds the character position of the first occurrence of a string; both arguments are in UTF-8.
 *
 * mbstring or iconv do the work when selected by
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']. Otherwise the offset is
 * converted to a byte position, the byte based strpos() performs the search
 * and the hit is converted back to a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search
 * @return	integer		The character position; FALSE if the offset lies beyond the string or the needle is not found
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])	{
		case 'mbstring':
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		case 'iconv':
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// offset beyond string length => nothing to search in
	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	$byte_pos = ($byte_offset === false) ? false : strpos($haystack,$needle,$byte_offset);

		// needle not found (or no valid offset) => FALSE, otherwise translate bytes back to characters
	return ($byte_pos === false) ? false : $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1753
/**
 * Finds the character position of the last occurrence of a char in a string; both arguments are in UTF-8.
 *
 * mbstring or iconv do the work when selected by
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']. Otherwise the byte based
 * strrpos() performs the search and the hit is converted to a character position.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position; FALSE if the needle is not found
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])	{
		case 'mbstring':
			return mb_strrpos($haystack,$needle,'utf-8');
		case 'iconv':
			return iconv_strrpos($haystack,$needle,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);

		// needle not found => FALSE, otherwise translate the byte position to a character position
	return ($byte_pos === false) ? false : $this->utf8_byte2char_pos($haystack,$byte_pos);
}
1775
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * A byte starts a character when it is not a UTF-8 continuation byte (10xxxxxx).
 * The string length is checked explicitly instead of probing out-of-range string
 * offsets, so the result does not depend on how the PHP version treats such reads
 * (negative string offsets wrap around from PHP 7.1 on).
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, or false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$len = strlen($str);
	$wanted = abs($pos);	// number of characters wanted
	$seen = 0;		// number of characters found

	if ($pos >= 0)	{
			// walk forward over $wanted character start bytes
		for ($i=0; $i<$len && $seen<$wanted; $i++)	{
			if ((ord($str[$i]) & 0xC0) != 0x80)	$seen++;
		}
		if ($i >= $len)	return false;	// offset beyond string length

			// skip the continuation bytes of the last character walked over
		while ($i<$len && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }

		return $i;
	}

		// walk backward until $wanted character start bytes have been passed
	for ($i=$len-1; $i>=0 && $seen<$wanted; $i--)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	$seen++;
	}
	if ($seen < $wanted)	return false;	// offset beyond start of string

		// $i stopped one byte before the start byte that was counted last
	return $i+1;
}
1816
/**
 * Translates an 'absolute' byte position into a character position.
 *
 * Counts the character boundaries found in bytes 1..$pos; byte 0 is never looked at,
 * so it is taken to start the first character. A byte is a boundary when it is not
 * a UTF-8 continuation byte (10xxxxxx). The position directly behind the last byte
 * is accepted and also counts as a boundary.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, or false if the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
	$len = strlen($str);
	if ($pos < 0 || $pos > $len)	return false;	// offset beyond string length

	$n = 0;	// number of characters
	for ($i=$pos; $i>0; $i--)	{
			// $i == $len is the end-of-string position, there is no byte to inspect
		if ($i == $len || (ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}

	return $n;
}
1839
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is split into characters (a single byte, or a start byte plus the
 * number of bytes announced by its leading 1-bits) and each character that has an
 * entry in the selected map is replaced by that entry. Characters without an entry
 * and stray continuation bytes are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$len = strlen($str);
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);

			// Default: the byte stands for itself. This covers single-byte characters
			// (0xxxxxxx) and stray continuation bytes (10xxxxxx), which must not
			// repeat the character of the previous iteration.
		$mbc = $str[$i];

		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903         /********************************************
1904          *
1905          * Internal EUC string operation functions
1906          *
1907          * Extended Unix Code:
1908          *  ASCII compatible 7bit single bytes chars
1909          *  8bit two byte chars
1910          *
1911          * Shift-JIS is treated as a special case.
1912          *
1913          ********************************************/
1914
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 *
 * The cut never splits a double-byte character: if the byte at the cut position is
 * the second byte of a character, the whole character is dropped. For 'shift_jis'
 * only bytes 0x80-0x9F and 0xE0 and above are treated as the first byte of a
 * double-byte character, for all other charsets every byte from 0x80 on is.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)	{
	$total = strlen($str);
		// Nothing to cut. Deciding this by length (instead of by where the scan below
		// ends) matters when the character straddling the cut is the last one of the
		// string: the scan index then jumps past the end of the string.
	if ($total == 0 || $total <= $len)	return $str;

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0))	$i++;	// advance a double-byte char
	}

	if ($i > $len)	{
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
1943
/**
 * Returns a part of a string in the EUC charset family.
 *
 * Character positions are translated into byte positions with euc_char2byte_pos()
 * and the actual cutting is done with substr().
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters)
 * @return	string		the substring, or false if $start lies behind the end of the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)	{
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	{
		if ($start >= 0)	return false;	// $start outside string length

			// a negative start reaching to or before the first character
			// begins at the start of the string, as substr() does
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

		// An explicit length of zero selects nothing. It has to be handled before the
		// loose null comparison below, which cannot tell 0 from "no length given".
	if ($len !== null && (string)$len === '0')	return '';

	if ($len == null)	return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	{	// $len outside actual string length
		return $str;
	}
	return substr($str,0,$byte_end);
}
1969
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * Every byte counts as one character, except that the byte following the first
 * byte of a double-byte character is skipped. For 'shift_jis' only bytes 0x80-0x9F
 * and 0xE0 and above are first bytes, for all other charsets every byte from 0x80 on.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$bytes = strlen($str);	// explicit bound, no out-of-range string offset is read
	$n = 0;
	for ($i=0; $i<$bytes; $i++)	{
		$c = ord($str[$i]);
		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0))	$i++;	// advance a double-byte char

		$n++;
	}

	return $n;
}
1996
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * The string is always scanned from its beginning. A negative position is first
 * converted into a position counted from the start, because the second byte of a
 * Shift-JIS double-byte character may lie in the ASCII range and therefore cannot
 * be recognised when scanning backwards.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or false if the position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$len = strlen($str);
	$p = abs($pos);	// number of characters wanted

	if ($pos < 0)	{
			// count all characters to turn "from the end" into "from the start"
		$total = 0;
		for ($i=0; $i<$len; $i++)	{
			$c = ord($str[$i]);
			if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0))	$i++;	// advance a double-byte char
			$total++;
		}
		if ($p > $total)	return false;	// offset beyond start of string
		$p = $total - $p;
	}

	$n = 0;	// number of characters seen
	for ($i=0; $i<$len && $n<$p; $i++)	{
		$c = ord($str[$i]);
		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0))	$i++;	// advance a double-byte char
		$n++;
	}
	if ($i >= $len)	return false;	// offset beyond string length

	return $i;
}
2036
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * The string is split into single-byte and double-byte characters and each
 * character that has an entry in the selected map is replaced by that entry;
 * all other characters are copied unchanged. For 'shift_jis' only bytes 0x80-0x9F
 * and 0xE0 and above start a double-byte character, for all other charsets every
 * byte from 0x80 on does.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$bytes = strlen($str);	// explicit bound, no out-of-range string offset is read
	$out = '';
	for ($i=0; $i<$bytes; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($c >= 0x80 && (!$sjis || $c < 0xA0 || $c >= 0xE0))	{	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2091
2092 }
2093
// Include the XCLASS file registered for this script, if any. !empty() keeps the
// original truthiness test but raises no notice when no entry is configured.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
2097 ?>