2 /***************************************************************
5 * (c) 2003-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
8 * This script is part of the Typo3 project. The Typo3 project is
9 * free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * The GNU General Public License can be found at
15 * http://www.gnu.org/copyleft/gpl.html.
17 * This script is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * This copyright notice MUST APPEAR in all copies of the script!
23 ***************************************************************/
25 * Class for conversion between charsets.
27 * Typo Id: class.t3lib_cs.php,v 1.56 2006/05/03 08:47:30 masi Exp $
30 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
31 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
34 * [CLASS/FUNCTION INDEX of SCRIPT]
39 * 488: function parse_charset($charset)
40 * 507: function get_locale_charset($locale)
42 * SECTION: Charset Conversion functions
43 * 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
44 * 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
45 * 617: function utf8_encode($str,$charset)
46 * 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
47 * 706: function utf8_to_entities($str)
48 * 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
49 * 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
50 * 823: function UnumberToChar($cbyte)
51 * 868: function utf8CharToUnumber($str,$hex=0)
53 * SECTION: Init functions
54 * 911: function initCharset($charset)
55 * 973: function initUnicodeData($mode=null)
56 * 1198: function initCaseFolding($charset)
57 * 1260: function initToASCII($charset)
59 * SECTION: String operation functions
60 * 1331: function substr($charset,$string,$start,$len=null)
61 * 1384: function strlen($charset,$string)
62 * 1414: function crop($charset,$string,$len,$crop='')
63 * 1467: function strtrunc($charset,$string,$len)
64 * 1501: function conv_case($charset,$string,$case)
65 * 1527: function specCharsToASCII($charset,$string)
67 * SECTION: Internal string operation functions
68 * 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
70 * SECTION: Internal UTF-8 string operation functions
71 * 1622: function utf8_substr($str,$start,$len=null)
72 * 1655: function utf8_strlen($str)
73 * 1676: function utf8_strtrunc($str,$len)
74 * 1698: function utf8_strpos($haystack,$needle,$offset=0)
75 * 1723: function utf8_strrpos($haystack,$needle)
76 * 1745: function utf8_char2byte_pos($str,$pos)
77 * 1786: function utf8_byte2char_pos($str,$pos)
78 * 1809: function utf8_char_mapping($str,$mode,$opt='')
80 * SECTION: Internal EUC string operation functions
81 * 1885: function euc_strtrunc($str,$len,$charset)
82 * 1914: function euc_substr($str,$start,$charset,$len=null)
83 * 1939: function euc_strlen($str,$charset)
84 * 1966: function euc_char2byte_pos($str,$pos,$charset)
85 * 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
88 * (This index is automatically created/updated by the extension "extdeveval")
102 * Functions working on UTF-8 strings:
107 * - implode/explode/join
109 * Functions nearly working on UTF-8 strings:
111 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
112 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
113 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
114 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
116 * Functions NOT working on UTF-8 strings:
130 * Class for conversion between charsets
132 * @author Kasper Skaarhoj <kasperYYYY@typo3.com>
133 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
138 var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
140 // This is the array where parsed conversion tables are stored (cached)
141 var $parsedCharsets=array();
143 // An array where case folding data will be stored (cached)
144 var $caseFolding=array();
146 // An array where charset-to-ASCII mappings are stored (cached)
147 var $toASCII=array();
149 // This tells the converter which charsets have two bytes per char:
150 var $twoByteSets=array(
151 'ucs-2'=>1, // 2-byte Unicode
154 // This tells the converter which charsets have four bytes per char:
155 var $fourByteSets=array(
156 'ucs-4'=>1, // 4-byte Unicode
157 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
160 // This tells the converter which charsets use a scheme like the Extended Unix Code:
161 var $eucBasedSets=array(
162 'gb2312'=>1, // Chinese, simplified.
163 'big5'=>1, // Chinese, traditional.
164 'euc-kr'=>1, // Korean
165 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
168 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
169 // http://czyborra.com/charsets/iso8859.html
172 'us-ascii'=> 'ascii',
173 'cp819' => 'iso-8859-1',
174 'ibm819' => 'iso-8859-1',
175 'iso-ir-100' => 'iso-8859-1',
176 'iso-ir-109' => 'iso-8859-2',
177 'iso-ir-148' => 'iso-8859-9',
178 'iso-ir-199' => 'iso-8859-14',
179 'iso-ir-203' => 'iso-8859-15',
180 'csisolatin1' => 'iso-8859-1',
181 'csisolatin2' => 'iso-8859-2',
182 'csisolatin3' => 'iso-8859-3',
183 'csisolatin5' => 'iso-8859-9',
184 'csisolatin8' => 'iso-8859-14',
185 'csisolatin9' => 'iso-8859-15',
186 'csisolatingreek' => 'iso-8859-7',
187 'iso-celtic' => 'iso-8859-14',
188 'latin1' => 'iso-8859-1',
189 'latin2' => 'iso-8859-2',
190 'latin3' => 'iso-8859-3',
191 'latin5' => 'iso-8859-9',
192 'latin6' => 'iso-8859-10',
193 'latin8' => 'iso-8859-14',
194 'latin9' => 'iso-8859-15',
195 'l1' => 'iso-8859-1',
196 'l2' => 'iso-8859-2',
197 'l3' => 'iso-8859-3',
198 'l5' => 'iso-8859-9',
199 'l6' => 'iso-8859-10',
200 'l8' => 'iso-8859-14',
201 'l9' => 'iso-8859-15',
202 'cyrillic' => 'iso-8859-5',
203 'arabic' => 'iso-8859-6',
204 'tis-620' => 'iso-8859-11',
205 'win874' => 'windows-874',
206 'win1250' => 'windows-1250',
207 'win1251' => 'windows-1251',
208 'win1252' => 'windows-1252',
209 'win1253' => 'windows-1253',
210 'win1254' => 'windows-1254',
211 'win1255' => 'windows-1255',
212 'win1256' => 'windows-1256',
213 'win1257' => 'windows-1257',
214 'win1258' => 'windows-1258',
215 'cp1250' => 'windows-1250',
216 'cp1251' => 'windows-1251',
217 'cp1252' => 'windows-1252',
218 'ms-ee' => 'windows-1250',
219 'ms-ansi' => 'windows-1252',
220 'ms-greek' => 'windows-1253',
221 'ms-turk' => 'windows-1254',
222 'winbaltrim' => 'windows-1257',
223 'koi-8ru' => 'koi-8r',
227 'macintosh' => 'macroman',
228 'euc-cn' => 'gb2312',
229 'x-euc-cn' => 'gb2312',
235 'sjis' => 'shift_jis',
236 'shift-jis' => 'shift_jis',
237 'cp932' => 'shift_jis',
248 // mapping of iso-639:2 language codes to script names
249 var $lang_to_script=array(
250 // iso-639:2 language codes, see:
251 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm
252 // http://www.loc.gov/standards/iso639-2/langcodes.html
253 // http://www.unicode.org/onlinedat/languages.html
255 'bg' => 'cyrillic', // Bulgarian
256 'bs' => 'east_european', // Bosnian
257 'cs' => 'east_european', // Czech
258 'da' => 'west_european', // Danish
259 'de' => 'west_european', // German
260 'es' => 'west_european', // Spanish
262 'eo' => 'unicode', // Esperanto
263 'eu' => 'west_european', // Basque
264 'fa' => 'arabic', // Persian
265 'fi' => 'west_european', // Finish
266 'fo' => 'west_european', // Faroese
267 'fr' => 'west_european', // French
269 'he' => 'hebrew', // Hebrew (since 1998)
270 'hi' => 'unicode', // Hindi
271 'hr' => 'east_european', // Croatian
272 'hu' => 'east_european', // Hungarian
273 'iw' => 'hebrew', // Hebrew (til 1998)
274 'is' => 'west_european', // Icelandic
275 'it' => 'west_european', // Italian
277 'kl' => 'west_european', // Greenlandic
279 'lt' => 'lithuanian',
280 'lv' => 'west_european', // Latvian/Lettish
281 'nl' => 'west_european', // Dutch
282 'no' => 'west_european', // Norwegian
283 'pl' => 'east_european', // Polish
284 'pt' => 'west_european', // Portuguese
285 'ro' => 'east_european', // Romanian
286 'ru' => 'cyrillic', // Russian
287 'sk' => 'east_european', // Slovak
288 'sl' => 'east_european', // Slovenian
289 'sr' => 'cyrillic', // Serbian
290 'sv' => 'west_european', // Swedish
292 'uk' => 'cyrillic', // Ukranian
293 'vi' => 'vietnamese',
295 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
296 // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
298 'bgr' => 'cyrillic', // Bulgarian
299 'cat' => 'west_european', // Catalan
300 'chs' => 'simpl_chinese',
301 'cht' => 'trad_chinese',
302 'csy' => 'east_european', // Czech
303 'dan' => 'west_european', // Danisch
304 'deu' => 'west_european', // German
305 'dea' => 'west_european', // German (Austrian)
306 'des' => 'west_european', // German (Swiss)
307 'ena' => 'west_european', // English (Australian)
308 'enc' => 'west_european', // English (Canadian)
309 'eng' => 'west_european', // English
310 'enz' => 'west_european', // English (New Zealand)
311 'enu' => 'west_european', // English (United States)
312 'euq' => 'west_european', // Basque
313 'fos' => 'west_european', // Faroese
314 'far' => 'arabic', // Persian
315 'fin' => 'west_european', // Finish
316 'fra' => 'west_european', // French
317 'frb' => 'west_european', // French (Belgian)
318 'frc' => 'west_european', // French (Canadian)
319 'frs' => 'west_european', // French (Swiss)
322 'hin' => 'unicode', // Hindi
323 'hun' => 'east_european', // Hungarian
324 'isl' => 'west_euorpean', // Icelandic
325 'ita' => 'west_european', // Italian
326 'its' => 'west_european', // Italian (Swiss)
329 'lth' => 'lithuanian',
330 'lvi' => 'west_european', // Latvian/Lettish
331 'msl' => 'west_european', // Malay
332 'nlb' => 'west_european', // Dutch (Belgian)
333 'nld' => 'west_european', // Dutch
334 'nor' => 'west_european', // Norwegian (bokmal)
335 'non' => 'west_european', // Norwegian (nynorsk)
336 'plk' => 'east_european', // Polish
337 'ptg' => 'west_european', // Portuguese
338 'ptb' => 'west_european', // Portuguese (Brazil)
339 'rom' => 'east_european', // Romanian
340 'rus' => 'cyrillic', // Russian
341 'slv' => 'east_european', // Slovenian
342 'sky' => 'east_european', // Slovak
343 'srl' => 'east_european', // Serbian (Latin)
344 'srb' => 'cyrillic', // Serbian (Cyrillic)
345 'esp' => 'west_european', // Spanish (trad. sort)
346 'esm' => 'west_european', // Spanish (Mexican)
347 'esn' => 'west_european', // Spanish (internat. sort)
348 'sve' => 'west_european', // Swedish
351 'ukr' => 'cyrillic', // Ukrainian
352 // English language names
353 'arabic' => 'arabic',
354 'basque' => 'west_european',
355 'bosnian' => 'east_european',
356 'bulgarian' => 'east_european',
357 'catalan' => 'west_european',
358 'croatian' => 'east_european',
359 'czech' => 'east_european',
360 'danish' => 'west_european',
361 'dutch' => 'west_european',
362 'english' => 'west_european',
363 'esperanto' => 'unicode',
364 'estonian' => 'estonian',
365 'faroese' => 'west_european',
367 'finnish' => 'west_european',
368 'french' => 'west_european',
369 'galician' => 'west_european',
370 'german' => 'west_european',
372 'greenlandic' => 'west_european',
373 'hebrew' => 'hebrew',
374 'hindi' => 'unicode',
375 'hungarian' => 'east_european',
376 'icelandic' => 'west_european',
377 'italian' => 'west_european',
378 'latvian' => 'west_european',
379 'lettish' => 'west_european',
380 'lithuanian' => 'lithuanian',
381 'malay' => 'west_european',
382 'norwegian' => 'west_european',
383 'persian' => 'arabic',
384 'polish' => 'east_european',
385 'portuguese' => 'west_european',
386 'russian' => 'cyrillic',
387 'romanian' => 'east_european',
388 'serbian' => 'cyrillic',
389 'slovak' => 'east_european',
390 'slovenian' => 'east_european',
391 'spanish' => 'west_european',
392 'svedish' => 'west_european',
394 'turkish' => 'turkish',
395 'ukrainian' => 'cyrillic',
398 // mapping of language (family) names to charsets on Unix
399 var $script_to_charset_unix=array(
400 'west_european' => 'iso-8859-1',
401 'estonian' => 'iso-8859-1',
402 'east_european' => 'iso-8859-2',
403 'baltic' => 'iso-8859-4',
404 'cyrillic' => 'iso-8859-5',
405 'arabic' => 'iso-8859-6',
406 'greek' => 'iso-8859-7',
407 'hebrew' => 'iso-8859-8',
408 'turkish' => 'iso-8859-9',
409 'thai' => 'iso-8859-11', // = TIS-620
410 'lithuanian' => 'iso-8859-13',
411 'chinese' => 'gb2312', // = euc-cn
412 'japanese' => 'euc-jp',
413 'korean' => 'euc-kr',
414 'simpl_chinese' => 'gb2312',
415 'trad_chinese' => 'big5',
417 'unicode' => 'utf-8',
420 // mapping of language (family) names to charsets on Windows
421 var $script_to_charset_windows=array(
422 'east_european' => 'windows-1250',
423 'cyrillic' => 'windows-1251',
424 'west_european' => 'windows-1252',
425 'greek' => 'windows-1253',
426 'turkish' => 'windows-1254',
427 'hebrew' => 'windows-1255',
428 'arabic' => 'windows-1256',
429 'baltic' => 'windows-1257',
430 'estonian' => 'windows-1257',
431 'lithuanian' => 'windows-1257',
432 'vietnamese' => 'windows-1258',
435 'chinese' => 'gb2312',
436 'japanese' => 'shift_jis',
437 'simpl_chinese' => 'gb2312',
438 'trad_chinese' => 'big5',
441 // mapping of locale names to charsets
442 var $locale_to_charset=array(
443 'japanese.euc' => 'euc-jp',
444 'ja_jp.ujis' => 'euc-jp',
445 'korean.euc' => 'euc-kr',
446 'sr@Latn' => 'iso-8859-2',
452 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
453 // Empty values means "iso-8859-1"
454 var $charSetArray = array(
462 'cz' => 'windows-1250',
463 'pl' => 'iso-8859-2',
464 'si' => 'windows-1250',
466 'tr' => 'iso-8859-9',
469 'ru' => 'windows-1251',
470 'ro' => 'iso-8859-2',
472 'sk' => 'windows-1250',
473 'lt' => 'windows-1257',
475 'hr' => 'windows-1250',
476 'hu' => 'iso-8859-2',
478 'th' => 'iso-8859-11',
479 'gr' => 'iso-8859-7',
482 'bg' => 'windows-1251',
484 'et' => 'iso-8859-4',
485 'ar' => 'iso-8859-6',
487 'ua' => 'windows-1251',
491 'ca' => 'iso-8859-15',
492 'ba' => 'iso-8859-2',
502 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
503 // Missing keys means: same as Typo3
504 var $isoArray = array(
/**
 * Normalizes a charset name: trims surrounding whitespace, lowercases it and
 * replaces a known synonym (a key of $this->synonyms) with its canonical name.
 *
 * @param	string		Input charset
 * @return	string		Normalized charset
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$charset = strtolower(trim($charset));

		// Resolve aliases to the canonical name
	if (isset($this->synonyms[$charset])) {
		$charset = $this->synonyms[$charset];
	}

	return $charset;
}
/**
 * Get the charset of a locale.
 *
 * Accepted locale forms:
 *   ln_CN           language / country
 *   ln_CN.cs        language / country / charset
 *   ln_CN.cs@mod    language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the 'euro' modifier, then language -> script -> charset with a
 * platform specific default.
 *
 * @param	string		Locale string
 * @return	string		Charset resolved for locale string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	$locale = strtolower($locale);

		// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// split off the modifier (the part after '@'), if any
	$parts = explode('@',$locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

		// locale contains charset: use it
	$parts = explode('.',$locale);
	$locale = $parts[0];
	$charset = isset($parts[1]) ? $parts[1] : '';
	if ($charset) {
		return $this->parse_charset($charset);
	}

		// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// language is the part before the first '_'; unknown languages give an empty script
	$parts = explode('_',$locale);
	$language = $parts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		// pick the platform table; fall back to the platform's western default
	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
	}

	return $cs;
}
584 /********************************************
586 * Charset Conversion functions
588 ********************************************/
591 * Convert from one charset to another charset.
593 * @param string Input string
594 * @param string From charset (the current charset of the string)
595 * @param string To charset (the output charset wanted)
596 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
597 * @return string Converted string
600 function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
601 if ($fromCS==$toCS) return $str;
603 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
604 if ($toCS=='utf-8' ||
!$useEntityForNoChar) {
605 switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
607 $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
608 if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
612 $conv_str = iconv($fromCS,$toCS.'//IGNORE',$str);
613 if (false !== $conv_str) return $conv_str;
617 $conv_str = recode_string($fromCS.'..'.$toCS,$str);
618 if (false !== $conv_str) return $conv_str;
621 // fallback to TYPO3 conversion
624 if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
625 if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
/**
 * Convert all elements in ARRAY from one charset to another charset.
 * NOTICE: Array is passed by reference!
 * Nested arrays are converted recursively; every other value is passed through conv().
 *
 * @param	array		Input array, possibly multidimensional
 * @param	string		From charset (the current charset of the string)
 * @param	string		To charset (the output charset wanted)
 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return	void
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
		// Nothing to traverse (avoids a foreach warning on non-array input)
	if (!is_array($array)) {
		return;
	}

	foreach (array_keys($array) as $key) {
		if (is_array($array[$key])) {
			$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
		} else {
			$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
		}
	}
}
651 * Converts $str from $charset to UTF-8
653 * @param string String in local charset to convert to UTF-8
654 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
655 * @return string Output string, converted to UTF-8
657 function utf8_encode($str,$charset) {
659 if ($charset === 'utf-8') return $str;
661 // Charset is case-insensitive.
662 if ($this->initCharset($charset)) { // Parse conv. table if not already...
663 $strLen = strlen($str);
666 for ($a=0;$a<$strLen;$a++
) { // Traverse each char in string.
667 $chr=substr($str,$a,1);
669 if (isset($this->twoByteSets
[$charset])) { // If the charset has two bytes per char
670 $ord2 = ord($str{$a+
1});
671 $ord = $ord<<8 |
$ord2; // assume big endian
673 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
674 $outStr.=$this->parsedCharsets
[$charset]['local'][$ord];
675 } else $outStr.=chr($this->noCharByteVal
); // No char exists
677 } elseif ($ord>127) { // If char has value over 127 it's a multibyte char in UTF-8
678 if (isset($this->eucBasedSets
[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
679 if ($charset != 'shift_jis' ||
($ord < 0xA0 ||
$ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
681 $ord2=ord(substr($str,$a,1));
682 $ord = $ord*256+
$ord2;
686 if (isset($this->parsedCharsets
[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
687 $outStr.= $this->parsedCharsets
[$charset]['local'][$ord];
688 } else $outStr.= chr($this->noCharByteVal
); // No char exists
689 } else $outStr.= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
696 * Converts $str from UTF-8 to $charset
698 * @param string String in UTF-8 to convert to local charset
699 * @param string Charset, lowercase. Must be found in csconvtbl/ folder.
700 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
701 * @return string Output string, converted to local charset
703 function utf8_decode($str,$charset,$useEntityForNoChar=0) {
705 // Charset is case-insensitive.
706 if ($this->initCharset($charset)) { // Parse conv. table if not already...
707 $strLen = strlen($str);
710 for ($a=0,$i=0;$a<$strLen;$a++
,$i++
) { // Traverse each char in UTF-8 string.
711 $chr=substr($str,$a,1);
713 if ($ord>127) { // This means multibyte! (first byte!)
714 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
716 $buf=$chr; // Add first byte
717 for ($b=0;$b<8;$b++
) { // for each byte in multibyte string...
718 $ord = $ord << 1; // Shift it left and ...
719 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
720 $a++
; // Increase pointer...
721 $buf.=substr($str,$a,1); // ... and add the next char.
725 if (isset($this->parsedCharsets
[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
726 $mByte = $this->parsedCharsets
[$charset]['utf8'][$buf]; // The local number
727 if ($mByte>255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
728 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
729 } else $outStr.= chr($mByte);
730 } elseif ($useEntityForNoChar) { // Create num entity:
731 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
732 } else $outStr.=chr($this->noCharByteVal
); // No char exists
733 } else $outStr.=chr($this->noCharByteVal
); // No char exists (MIDDLE of MB sequence!)
734 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
/**
 * Converts all chars > 127 to numeric entities.
 * Each UTF-8 multibyte sequence becomes '&#' . utf8CharToUnumber(sequence, 1) . ';'
 * (i.e. a hexadecimal entity); a continuation byte found outside a sequence is
 * replaced by the "no char" byte.
 *
 * @param	string		Input string
 * @return	string		Output string
 */
function utf8_to_entities($str) {
	$strLen = strlen($str);
	$outStr = '';

	for ($a=0; $a<$strLen; $a++) {	// Traverse each byte in UTF-8 string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($ord <= 127) {	// ASCII 0-127 is one byte. Transparent
			$outStr .= $chr;
			continue;
		}

		if (!($ord & 64)) {	// 10xxxxxx: a continuation byte in the MIDDLE of a sequence
			$outStr .= chr($this->noCharByteVal);
			continue;
		}

			// Lead byte: every 1-bit following the top bit announces one continuation byte
		$extra = 0;
		for ($mask=0x40; $mask && ($ord & $mask); $mask>>=1) {
			$extra++;
		}

		$buf = substr($str,$a,$extra+1);	// lead byte + continuation bytes (clipped at end of string)
		$a += $extra;				// skip the bytes just consumed

		$outStr .= '&#'.$this->utf8CharToUnumber($buf,1).';';
	}

	return $outStr;
}
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x4D2;)) to UTF-8 multibyte chars
 * Malformed numeric entities and unknown named entities are left untouched.
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
 * @return	string		Output string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The table must be in iso-8859-1 because the characters are run through
			// utf8_encode(...,'iso-8859-1') below. PHP >= 5.4 defaults to UTF-8, so request it explicitly
			// where the encoding parameter exists.
		if (version_compare(PHP_VERSION,'5.3.4','>=')) {
			$table = get_html_translation_table(HTML_ENTITIES,ENT_COMPAT,'ISO-8859-1');
		} else {
			$table = get_html_translation_table(HTML_ENTITIES);
		}
		if (is_array($table)) {
			$trans_tbl = array_flip($table);
		}
	}

		// Split on "&...;" keeping the entity body: even indexes are plain text, odd indexes are entity bodies
	$parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
	if (!is_array($parts)) {
		return $str;
	}

	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;	// plain text between entities
		}

		$entity = '&'.$v.';';
		if (substr($v,0,1)=='#') {	// Dec or hex entities:
			$number = substr($v,1);
			if (preg_match('/^[xX]([0-9a-fA-F]+)$/',$number,$reg)) {
				$parts[$k] = $this->UnumberToChar(hexdec($reg[1]));
			} elseif (preg_match('/^[0-9]+$/',$number)) {
				$parts[$k] = $this->UnumberToChar(intval($number));
			} else {	// Not a valid number: no conversion
				$parts[$k] = $entity;
			}
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl[$entity])) {	// Other entities:
			$parts[$k] = $this->utf8_encode($trans_tbl[$entity],'iso-8859-1');
		} else {	// No conversion:
			$parts[$k] = $entity;
		}
	}

	return implode('',$parts);
}
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * @param	string		Input string, UTF-8
 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return	array		Output array with the char numbers
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
		// If entities must be registered as well...:
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$outArr = array();
	$strLen = strlen($str);

	for ($a=0; $a<$strLen; $a++) {	// Traverse each byte in UTF-8 string.
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($ord <= 127) {	// ASCII 0-127 is one byte. Transparent
			$outArr[] = $retChar ? $chr : $ord;
			continue;
		}

		if (!($ord & 64)) {	// 10xxxxxx: a continuation byte in the MIDDLE of a sequence
			$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

			// Lead byte: every 1-bit following the top bit announces one continuation byte
		$extra = 0;
		for ($mask=0x40; $mask && ($ord & $mask); $mask>>=1) {
			$extra++;
		}

		$buf = substr($str,$a,$extra+1);	// lead byte + continuation bytes (clipped at end of string)
		$a += $extra;				// skip the bytes just consumed

		$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
	}

	return $outArr;
}
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * Numbers that cannot be expressed (negative, or 32 bits and more) yield the "no char" byte.
 *
 * @param	integer		UNICODE integer
 * @return	string		UTF-8 multibyte character string
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	$str = '';

	if ($cbyte < 0) {	// Not a code point at all
		$str .= chr($this->noCharByteVal);
	} else if ($cbyte < 0x80) {
		$str .= chr($cbyte);
	} else if ($cbyte < 0x800) {
		$str .= chr(0xC0 | ($cbyte >> 6));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} else if ($cbyte < 0x10000) {
		$str .= chr(0xE0 | ($cbyte >> 12));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} else if ($cbyte < 0x200000) {
		$str .= chr(0xF0 | ($cbyte >> 18));
		$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} else if ($cbyte < 0x4000000) {
		$str .= chr(0xF8 | ($cbyte >> 24));
		$str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} else if ($cbyte < 0x80000000) {
		$str .= chr(0xFC | ($cbyte >> 30));
		$str .= chr(0x80 | (($cbyte >> 24) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
		$str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
		$str .= chr(0x80 | ($cbyte & 0x3F));
	} else {	// Cannot express a 32-bit character in UTF-8
		$str .= chr($this->noCharByteVal);
	}

	return $str;
}
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Only the first character of the input is decoded. A single byte that is not a
 * multibyte lead byte is returned as its plain byte value. Continuation bytes
 * missing from a truncated sequence count as zero bits.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with 'x').
 * @return	integer		UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str,$hex=0) {
	$ord = ord(substr($str,0,1));	// First char

	if (($ord & 192) == 192) {	// This verifies that it IS a multi byte string
			// Every 1-bit following the top bit of the lead byte announces one continuation byte
		$extra = 0;
		for ($mask=0x40; $mask && ($ord & $mask); $mask>>=1) {
			$extra++;
		}

			// Payload of the lead byte: the low (6 - $extra) bits
		$int = $ord & (0x3F >> $extra);

			// Each continuation byte contributes its low 6 bits
		for ($i=1; $i<=$extra; $i++) {
			$int = ($int << 6) | (ord(substr($str,$i,1)) & 0x3F);
		}
	} else {
		$int = $ord;
	}

	return $hex ? 'x'.dechex($int) : $int;
}
935 /********************************************
939 ********************************************/
942 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
943 * This function is automatically called by the conversion functions
945 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
947 * @param string The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
948 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
951 function initCharset($charset) {
952 // Only process if the charset is not yet loaded:
953 if (!is_array($this->parsedCharsets
[$charset])) {
955 // Conversion table filename:
956 $charsetConvTableFile = PATH_t3lib
.'csconvtbl/'.$charset.'.tbl';
958 // If the conversion table is found:
959 if ($charset && t3lib_div
::validPathStr($charsetConvTableFile) && @is_file
($charsetConvTableFile)) {
960 // Cache file for charsets:
961 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
962 $cacheFile = t3lib_div
::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
963 if ($cacheFile && @is_file
($cacheFile)) {
964 $this->parsedCharsets
[$charset]=unserialize(t3lib_div
::getUrl($cacheFile));
966 // Parse conversion table into lines:
967 $lines=t3lib_div
::trimExplode(chr(10),t3lib_div
::getUrl($charsetConvTableFile),1);
968 // Initialize the internal variable holding the conv. table:
969 $this->parsedCharsets
[$charset]=array('local'=>array(),'utf8'=>array());
970 // traverse the lines:
972 foreach($lines as $value) {
973 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored.
975 // Detect type if not done yet: (Done on first real line)
976 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
977 if (!$detectedType) $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ?
'whitespaced' : 'ms-token';
979 if ($detectedType=='ms-token') {
980 list($hexbyte,$utf8) = split('=|:',$value,3);
981 } elseif ($detectedType=='whitespaced') {
983 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
985 $utf8 = 'U+'.$regA[2];
987 $decval = hexdec(trim($hexbyte));
989 $utf8decval = hexdec(substr(trim($utf8),2));
990 $this->parsedCharsets
[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
991 $this->parsedCharsets
[$charset]['utf8'][$this->parsedCharsets
[$charset]['local'][$decval]]=$decval;
996 t3lib_div
::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets
[$charset]));
1000 } else return false;
/**
 * Initializes the UTF-8 character data tables.
 * Parses unidata/UnicodeData.txt (BMP only) and, when present, unidata/SpecialCasing.txt and
 * unidata/Translit.txt into $this->caseFolding['utf-8'] (toUpper / toLower / toTitle) and
 * $this->toASCII['utf-8'], then writes both tables to cache files below typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param	string		Mode ("case", "ascii", ...): decides which table is checked for "already loaded" / "cached"
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initUnicodeData($mode=null)	{
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode)	{
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileCase && @is_file($cacheFileCase))	{
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))	return 1;

				// Use cached version if possible
			if ($cacheFileASCII && @is_file($cacheFileASCII))	{
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];	// a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh))	{
		$line = fgets($fh,4096);
		if ($line === false || trim($line) == '')	continue;	// EOF or blank line: nothing to parse

			// has a lot of info; pad so that short lines do not produce undefined offsets
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title) = $fields;

		$ord = hexdec($char);
		if ($ord > 0xFFFF)	break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper)	$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower)	$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper)	$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1))	{
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '')	$number["U+$char"] = $num;
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (!$decomp && preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match))	{
			$c = ord($match[2]);
			if ($match[1] == 'SMALL')	$c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))	{
			switch($match[1])	{
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /',$match[2]))	continue 2;
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{
		$fh = fopen($specialCasingFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '')	continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
					// only unconditional mappings are used (condition field empty or a comment)
				if ($cond == '' || substr($cond,0,1) == '#')	{
					$utf8_char = $this->UnumberToChar(hexdec($char));

						// collect the mappings that differ from the character itself, in the order lower, title, upper
					$targets = array();
					if ($char != $lower)	$targets['toLower'] = $lower;
					if ($char != $title && $title != $upper)	$targets['toTitle'] = $title;
					if ($char != $upper)	$targets['toUpper'] = $upper;

					foreach ($targets as $direction => $sequence)	{
						$folded = '';
						foreach (explode(' ',$sequence) as $hex)	{
							$folded .= $this->UnumberToChar(hexdec($hex));
						}
						$utf8CaseFolding[$direction][$utf8_char] = $folded;
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))	{
		$fh = fopen($customTranslitFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false || substr($line,0,1) == '#' || trim($line) == '')	continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit)	$omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to)	{
		$code_decomp = array();

		while ($code_value = array_shift($to))	{
			if (isset($decomposition["U+$code_value"]))	{	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv)	{
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"]))	{	// remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from]))	{
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to)	{
		$code_decomp = array();
		while ($code_value = array_shift($to))	{
			$ord = hexdec($code_value);
			if ($ord > 127)	{
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to)	{
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char]))	{
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
/**
 * Builds the case folding table for a charset other than UTF-8.
 * Called automatically by the case folding functions.
 * The table is derived from the UTF-8 folding table by converting every character of the
 * charset's conversion table, then completed with the plain ASCII a-z / A-Z mapping.
 *
 * @param	string		Charset for which to initialize case folding.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initCaseFolding($charset)	{
		// Nothing to do when the table is already in memory
	if (is_array($this->caseFolding[$charset]))	{
		return 1;
	}

		// Prefer the cached table
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset conversion table and the UTF-8 folding table are required (checked in this order)
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case'))	{
		return false;
	}

	$noChar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$localChar = $this->utf8_decode($utf8Char, $charset);

		foreach ($directions as $direction)	{
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8Char], $charset);
			if ($folded != '' && $folded != $noChar)	{
				$this->caseFolding[$charset][$direction][$localChar] = $folded;
			}
		}
	}

		// add the ASCII case table
	for ($offset=0; $offset<26; $offset++)	{
		$lowerLetter = chr(ord('a')+$offset);
		$upperLetter = chr(ord('A')+$offset);
		$this->caseFolding[$charset]['toUpper'][$lowerLetter] = $upperLetter;
		$this->caseFolding[$charset]['toLower'][$upperLetter] = $lowerLetter;
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
/**
 * Initializes the to-ASCII conversion table for a charset other than UTF-8.
 * Called automatically by the ASCII transliteration functions.
 * The table maps each character of the charset that has an entry in the UTF-8
 * transliteration table to that entry's ASCII replacement.
 *
 * @param	string		Charset for which to initialize conversion.
 * @return	integer		Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 */
function initToASCII($charset)	{
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// Always start from an array: otherwise a charset without any transliterable character
		// would leave the table unset, defeat the "already loaded" check above and cache a non-array.
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		if (isset($this->toASCII['utf-8'][$utf8]))	{
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
1353 /********************************************
1355 * String operation functions
1357 ********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); NULL means "up to the end of the string"
 * @return	string		The substring
 * @see substr(), mb_substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($utils == 'mbstring')	{
			// $len cannot be omitted when a charset is given, so pass the full character
			// count instead of temporarily switching the global internal encoding
		if ($len == null)	{
			return mb_substr($string,$start,mb_strlen($string,$charset),$charset);
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($utils == 'iconv')	{
			// same as above for iconv
		if ($len == null)	{
			return iconv_substr($string,$start,iconv_strlen($string,$charset),$charset);
		}
		return iconv_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif ($this->eucBasedSets[$charset])	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif ($this->twoByteSets[$charset])	{
			// a NULL length must not be multiplied: NULL*2 is 0 and would yield an empty string
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif ($this->fourByteSets[$charset])	{
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset,$string)	{
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

		// Delegate to a PHP extension when one is configured
	if ($utils == 'mbstring')	{
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv')	{
		return iconv_strlen($string,$charset);
	}

		// Variable-width charsets need a scan of the string
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset])	{
		return $this->euc_strlen($string,$charset);
	}

		// Fixed-width charsets: divide the byte count by the character width
	if ($this->twoByteSets[$charset])	{
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset])	{
		return strlen($string)/4;
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters); positive keeps the beginning, negative keeps the end
 * @param	string		Crop signifier
 * @return	string		The shortened string (the unchanged input when nothing has to be cropped)
 * @see substr(), mb_strimwidth()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset,$string,$len,$crop='')	{
	if (intval($len) == 0)	return $string;

		// Translate the character count into a byte position
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} else {
			// everything else is handled byte-wise
		if ($len > 0)	{
			$i = $len;
		} else {
			$i = strlen($string)+$len;
			if ($i<=0)	$i = false;
		}
	}

	if ($i === false)	{	// $len outside actual string length
		return $string;
	}

		// Compare against the byte length instead of probing $string{$i}: curly-brace offsets
		// are gone in PHP 8 and reading past the end of a string raises a warning.
	$byteLength = strlen($string);
	if ($len > 0)	{
		if ($i < $byteLength)	{	// something is left behind the cut
			return substr($string,0,$i).$crop;
		}
	} elseif ($i > 0 && $i <= $byteLength)	{	// something is left in front of the cut
		return $crop.substr($string,$i);
	}

	return $string;
}
/**
 * Cuts a string short at a given byte length.
 * The cut never splits a multi-byte character: the result may be shorter than $len bytes.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		The byte length
 * @return	string		The shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif ($this->eucBasedSets[$charset])	{
			// euc_strtrunc() expects ($str,$len,$charset); the length must be passed along
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif ($this->twoByteSets[$charset])	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif ($this->fourByteSets[$charset])	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German sharp S (U+00DF) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert case for
 * @param	string		Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return	string		The converted string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset,$string,$case)	{
	$useMbstring = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3;
	if ($useMbstring)	{
		return $case == 'toLower' ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
/**
 * Converts special chars (accented letters, umlauts etc) to ascii equivalents (possibly several characters, like "ae" for a-umlaut)
 *
 * @param	string		Character set of string
 * @param	string		Input string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}
1591 /********************************************
1593 * Internal string operation functions
1595 ********************************************/
/**
 * Maps all characters of a string in a single byte charset.
 *
 * @param	string		the string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input when the mode is unknown or the table cannot be loaded)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
		// Pick the mapping table. It is read by value (not by reference) so that an
		// unknown $opt does not create an empty entry in the shared table.
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	$str = (string)$str;
	$length = strlen($str);
	$out = '';
	for ($i=0; $i<$length; $i++)	{
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}
1645 /********************************************
1647 * Internal UTF-8 string operation functions
1649 ********************************************/
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters)
 * @return	string		The substring
 * @see substr()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str,$start,$len=null)	{
	if (!strcmp($len,'0'))	{
		return '';
	}

	$startByte = $this->utf8_char2byte_pos($str,$start);
	if ($startByte === false && $start > 0)	{
		return false;	// $start outside string length
	}

		// A start that could not be resolved and is not positive ends up here with
		// $startByte === false, which substr() treats as offset 0.
	$tail = substr($str,$startByte);

	if ($len == null)	{
		return $tail;
	}

	$endByte = $this->utf8_char2byte_pos($tail,$len);
	if ($endByte !== false)	{
		return substr($tail,0,$endByte);
	}

		// $len outside actual string length: when length is less than zero and exceeds, then we return blank string.
	return $len<0 ? '' : $tail;
}
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 * Every byte that is not a continuation byte (10xxxxxx) counts as one character.
 *
 * @param	string		UTF-8 multibyte character string
 * @return	integer		The number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str)	{
	$str = (string)$str;
	$length = strlen($str);	// byte length; bounds the scan instead of probing past the end
	$n = 0;
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
		if (!($c & 0x80))	{	// single-byte (0xxxxxx)
			$n++;
		} elseif (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that would be split by the cut is dropped completely.
 *
 * @param	string		UTF-8 multibyte character string
 * @param	integer		the byte length
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str,$len)	{
	$str = (string)$str;
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80)	{	// last kept byte is part of a multibyte sequence
		while ($i > 0 && !(ord($str[$i]) & 0x40))	$i--;	// find the first byte

		$lead = ord($str[$i]);
			// Only continuation bytes up to the start of the string: malformed input.
			// A proper lead byte at position 0 is fine and must not be rejected.
		if (!($lead & 0x40))	return '';

		for ($bc=0; $lead & 0x80; $lead = $lead << 1)	$bc++;	// calculate number of bytes
		if ($bc+$i > $len)	return substr($str,0,$i);
			// fallthru: multibyte char fits into length
	}
	return substr($str,0,$len);
}
/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position to start the search
 * @return	integer		The character position
 * @see strpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring')	{
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}
	if ($utils == 'iconv')	{
		return iconv_strpos($haystack,$needle,$offset,'utf-8');
	}

		// Search byte-wise, then translate the byte position back into a character position
	$startByte = $this->utf8_char2byte_pos($haystack,$offset);
	if ($startByte === false)	{
		return false;	// offset beyond string length
	}

	$foundByte = strpos($haystack,$needle,$startByte);
	return $foundByte === false
		? false	// needle not found
		: $this->utf8_byte2char_pos($haystack,$foundByte);
}
/**
 * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 character to search for (single character)
 * @return	integer		The character position
 * @see strrpos()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack,$needle)	{
	$utils = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($utils == 'mbstring')	{
		return mb_strrpos($haystack,$needle,'utf-8');
	}
	if ($utils == 'iconv')	{
		return iconv_strrpos($haystack,$needle,'utf-8');
	}

		// Search byte-wise, then translate the byte position back into a character position
	$foundByte = strrpos($haystack,$needle);
	return $foundByte === false
		? false	// needle not found
		: $this->utf8_byte2char_pos($haystack,$foundByte);
}
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position, or FALSE when the position lies beyond the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str,$pos)	{
	$str = (string)$str;
	$length = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $length-1;
		$d = -1;
	}

		// The scan is bounded explicitly on both sides: a backward scan may step to index -1,
		// which PHP >= 7.1 would silently read as the last byte of the string.
	for ( ; $i>=0 && $i<$length && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if (!($c & 0x80))	{	// single-byte (0xxxxxx)
			$n++;
		} elseif (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}
	if ($i<0 || $i>=$length)	return false;	// offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes (10xxxxxx)
		while ($i<$length && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }
	} else {
			// correct offset
		$i++;
	}

	return $i;
}
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * @param	string		UTF-8 string
 * @param	integer		byte position
 * @return	integer		character position, or FALSE when the byte position lies outside the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str,$pos)	{
	$str = (string)$str;
		// Reject positions outside the string up front; otherwise the bytes "read" past the
		// end would be counted as single-byte characters.
	if ($pos < 0 || $pos >= strlen($str))	return false;	// offset beyond string length

	$n = 0;				// number of characters
	for ($i=$pos; $i>0; $i--)	{
		$c = ord($str[$i]);
		if (!($c & 0x80))	{	// single-byte (0xxxxxx)
			$n++;
		} elseif (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			$n++;
		}
	}

	return $n;
}
/**
 * Maps all characters of an UTF-8 string.
 *
 * @param	string		UTF-8 string
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input when the mode is unknown or the tables cannot be loaded)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

		// Pick the mapping table. It is read by value (not by reference) so that an
		// unknown $opt does not create an empty entry in the shared table.
	switch($mode)	{
		case 'case':
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
			break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
			break;

		default:
			return $str;
	}

	$str = (string)$str;
	$length = strlen($str);
	$out = '';
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
			// Default: the byte stands for itself. This covers ASCII and also stray continuation
			// bytes (10xxxxxx), which are passed through instead of repeating the previous character.
		$mbc = $str[$i];
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
1903 /********************************************
1905 * Internal EUC string operation functions
1907 * Extended Unix Code:
1908 * ASCII compatible 7bit single bytes chars
1909 * 8bit two byte chars
1911 * Shift-JIS is treated as a special case.
1913 ********************************************/
/**
 * Cuts a string in the EUC charset family short at a given byte length.
 * A double-byte character that would be split by the cut is dropped completely.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		the byte length
 * @param	string		the charset
 * @return	string		the shortened string
 * @see mb_strcut()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strtrunc($str,$len,$charset)	{
	$str = (string)$str;
	$length = strlen($str);
		// Decide "string shorter than supplied length" from the byte length. Probing the byte
		// after the scan is unreliable, since the scan overshoots the end of the string when a
		// double-byte character straddles the cut at the very end.
	if ($len >= $length)	return $str;

	$sjis = ($charset == 'shift_jis');
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	if ($i > $len)	{
		return substr($str,0,$len-1);	// we ended on a first byte
	}
	return substr($str,0,$len);
}
/**
 * Returns a part of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		start position (character position)
 * @param	string		the charset
 * @param	integer		length (in characters)
 * @return	string		the substring
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_substr($str,$start,$charset,$len=null)	{
	$startByte = $this->euc_char2byte_pos($str,$start,$charset);
	if ($startByte === false)	{
		return false;	// $start outside string length
	}

	$tail = substr($str,$startByte);
	if ($len == null)	{
		return $tail;
	}

	$endByte = $this->euc_char2byte_pos($tail,$len,$charset);
	if ($endByte === false)	{
		return $tail;	// $len outside actual string length
	}
	return substr($tail,0,$endByte);
}
/**
 * Counts the number of characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @return	integer		the number of characters
 * @see strlen()
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_strlen($str,$charset)	{
	$str = (string)$str;
	$length = strlen($str);	// byte length; bounds the scan instead of probing past the end
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	for ($i=0; $i<$length; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
/**
 * Translates a character position into an 'absolute' byte position.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		character position (negative values start from the end)
 * @param	string		the charset
 * @return	integer		byte position, or FALSE when the position lies beyond the string
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$str = (string)$str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;	// number of characters seen
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $length-1;
		$d = -1;
	}

		// The scan is bounded explicitly on both sides: a backward scan may step below index 0,
		// and negative string offsets are read from the end of the string on PHP >= 7.1.
	for ( ; $i>=0 && $i<$length && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i+=$d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i<0 || $i>=$length)	return false;	// offset beyond string length

	if ($pos < 0)	$i++;	// correct offset

	return $i;
}
/**
 * Maps all characters of a string in the EUC charset family.
 *
 * @param	string		EUC multibyte character string
 * @param	string		the charset
 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		'case': conversion 'toLower' or 'toUpper'
 * @return	string		the converted string (the unchanged input when the mode is unknown or the table cannot be loaded)
 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
		// Pick the mapping table. It is read by value (not by reference) so that an
		// unknown $opt does not create an empty entry in the shared table.
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
			break;

		default:
			return $str;
	}

	$str = (string)$str;
	$length = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
	for ($i=0; $i<$length; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte)	{	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
2094 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']) {
2095 include_once($TYPO3_CONF_VARS[TYPO3_MODE
]['XCLASS']['t3lib/class.t3lib_cs.php']);